|
|
ceph 出现告警,解决流程:
[root@hostceph1 ~]# ceph health detail
HEALTH_WARN 1 daemons have recently crashed
RECENT_CRASH 1 daemons have recently crashed
    osd.29 crashed on host compute08 at 2022-03-01 10:31:17.079004Z

[root@hostceph1 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10 osd.29  *
[root@hostceph1 ~]# ceph crash info 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
{
    "os_version_id": "7",
    "assert_condition": "e.version > info.last_update",
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "os_name": "CentOS Linux",
    "entity_name": "osd.29",
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc",
    "timestamp": "2022-03-01 10:31:17.079004Z",
    "process_name": "ceph-osd",
    "utsname_machine": "x86_64",
    "assert_line": 3964,
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "tp_osd_tp",
    "utsname_version": "#1 SMP Wed Nov 18 03:43:48 UTC 2020",
    "backtrace": [
        "(()+0xf630) [0x7fb551f8f630]",
        "(gsignal()+0x37) [0x7fb550d82387]",
        "(abort()+0x148) [0x7fb550d83a78]",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x55adc93aa704]",
        "(()+0x4cc87d) [0x55adc93aa87d]",
        "(PG::add_log_entry(pg_log_entry_t const&, bool)+0x1f5) [0x55adc953f3f5]",
        "(PG::append_log(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, eversion_t, eversion_t, ObjectStore::Transaction&, bool, bool)+0x10b) [0x55adc956f01b]",
        "(non-virtual thunk to PrimaryLogPG::log_operation(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, boost::optional<pg_hit_set_history_t> const&, eversion_t const&, eversion_t const&, bool, ObjectStore::Transaction&, bool)+0x95) [0x55adc96598a5]",
        "(ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>)+0xaa9) [0x55adc977a7a9]",
        "(ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x257) [0x55adc9788f57]",
        "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x4a) [0x55adc9699dea]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x5b3) [0x55adc964a1d3]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x362) [0x55adc948ab62]",
        "(PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55adc9719752]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f) [0x55adc94a5b5f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x55adc9a49dd6]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55adc9a4c8f0]",
        "(()+0x7ea5) [0x7fb551f87ea5]",
        "(clone()+0x6d) [0x7fb550e4a9fd]"
    ],
    "utsname_hostname": "compute08",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: In function 'void PG::add_log_entry(const pg_log_entry_t&, bool)' thread 7fb52ad89700 time 2022-03-01 18:31:17.054438\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: 3964: FAILED ceph_assert(e.version > info.last_update)\n",
    "crash_id": "2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10",
    "assert_func": "void PG::add_log_entry(const pg_log_entry_t&, bool)",
    "ceph_version": "14.2.8-111.el7"
}

[root@hostceph1 ~]# ceph crash archive 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
[root@hostceph1 ~]# ceph health detail
HEALTH_OK

解决完成。

以下只是查看命令:
[root@hostceph1 ~]# ceph config get mgr/crash/warn_recent_interval
Error EINVAL: unrecognized entity 'mgr/crash/warn_recent_interval'
[root@hostceph1 ~]# ceph get mgr/crash/warn_recent_interval
no valid command found; 10 closest matches:
osd pause
osd unpause
osd get-require-min-compat-client
osd set-require-min-compat-client <version> {--yes-i-really-mean-it}
osd set-backfillfull-ratio <float[0.0-1.0]>
osd set-nearfull-ratio <float[0.0-1.0]>
mds count-metadata <property>
mds metadata {<who>}
fs dump {<int[0-]>}
versions
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph config set mgr/crash/warn_recent_interval 0
Invalid command: missing required parameter value(<string>)
config set <who> <name> <value> {--force} :  Set a configuration option for one or more entities
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph crash archive-all
[root@hostceph1 ~]# ceph -s
  cluster:
    id:     29046cc0-0682-496b-98b1-912e59964282
    health: HEALTH_OK

  services:
    mon: 3 daemons, quorum hostceph1,hostceph2,hostceph3 (age 27m)
    mgr: hostceph1(active, since 53m), standbys: hostceph2, hostceph3
    osd: 34 osds: 34 up (since 27m), 34 in (since 45m)

  data:
    pools:   9 pools, 9344 pgs
    objects: 1.21M objects, 4.6 TiB
    usage:   16 TiB used, 110 TiB / 126 TiB avail
    pgs:     9344 active+clean

  io:
    client:   2.7 KiB/s rd, 13 MiB/s wr, 0 op/s rd, 97 op/s wr
|
|