|
|
ceph 出现告警,解决流程:
[root@hostceph1 ~]# ceph health detail
HEALTH_WARN 1 daemons have recently crashed
RECENT_CRASH 1 daemons have recently crashed
    osd.29 crashed on host compute08 at 2022-03-01 10:31:17.079004Z

[root@hostceph1 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10 osd.29  *
[root@hostceph1 ~]# ceph crash info 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
{
    "os_version_id": "7",
    "assert_condition": "e.version > info.last_update",
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "os_name": "CentOS Linux",
    "entity_name": "osd.29",
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc",
    "timestamp": "2022-03-01 10:31:17.079004Z",
    "process_name": "ceph-osd",
    "utsname_machine": "x86_64",
    "assert_line": 3964,
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "tp_osd_tp",
    "utsname_version": "#1 SMP Wed Nov 18 03:43:48 UTC 2020",
    "backtrace": [
        "(()+0xf630) [0x7fb551f8f630]",
        "(gsignal()+0x37) [0x7fb550d82387]",
        "(abort()+0x148) [0x7fb550d83a78]",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x55adc93aa704]",
        "(()+0x4cc87d) [0x55adc93aa87d]",
        "(PG::add_log_entry(pg_log_entry_t const&, bool)+0x1f5) [0x55adc953f3f5]",
        "(PG::append_log(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, eversion_t, eversion_t, ObjectStore::Transaction&, bool, bool)+0x10b) [0x55adc956f01b]",
        "(non-virtual thunk to PrimaryLogPG::log_operation(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, boost::optional<pg_hit_set_history_t> const&, eversion_t const&, eversion_t const&, bool, ObjectStore::Transaction&, bool)+0x95) [0x55adc96598a5]",
        "(ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>)+0xaa9) [0x55adc977a7a9]",
        "(ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x257) [0x55adc9788f57]",
        "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x4a) [0x55adc9699dea]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x5b3) [0x55adc964a1d3]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x362) [0x55adc948ab62]",
        "(PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55adc9719752]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f) [0x55adc94a5b5f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x55adc9a49dd6]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55adc9a4c8f0]",
        "(()+0x7ea5) [0x7fb551f87ea5]",
        "(clone()+0x6d) [0x7fb550e4a9fd]"
    ],
    "utsname_hostname": "compute08",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: In function 'void PG::add_log_entry(const pg_log_entry_t&, bool)' thread 7fb52ad89700 time 2022-03-01 18:31:17.054438\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: 3964: FAILED ceph_assert(e.version > info.last_update)\n",
    "crash_id": "2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10",
    "assert_func": "void PG::add_log_entry(const pg_log_entry_t&, bool)",
    "ceph_version": "14.2.8-111.el7"
}

[root@hostceph1 ~]# ceph crash archive 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
[root@hostceph1 ~]# ceph health detail
HEALTH_OK

解决完成。

以下只是查看命令:
[root@hostceph1 ~]# ceph config get mgr/crash/warn_recent_interval
Error EINVAL: unrecognized entity 'mgr/crash/warn_recent_interval'
[root@hostceph1 ~]# ceph get mgr/crash/warn_recent_interval
no valid command found; 10 closest matches:
osd pause
osd unpause
osd get-require-min-compat-client
osd set-require-min-compat-client <version> {--yes-i-really-mean-it}
osd set-backfillfull-ratio <float[0.0-1.0]>
osd set-nearfull-ratio <float[0.0-1.0]>
mds count-metadata <property>
mds metadata {<who>}
fs dump {<int[0-]>}
versions
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph config set mgr/crash/warn_recent_interval 0
Invalid command: missing required parameter value(<string>)
config set <who> <name> <value> {--force} : Set a configuration option for one or more entities
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph crash archive-all
[root@hostceph1 ~]# ceph -s
  cluster:
    id:     29046cc0-0682-496b-98b1-912e59964282
    health: HEALTH_OK

  services:
    mon: 3 daemons, quorum hostceph1,hostceph2,hostceph3 (age 27m)
    mgr: hostceph1(active, since 53m), standbys: hostceph2, hostceph3
    osd: 34 osds: 34 up (since 27m), 34 in (since 45m)

  data:
    pools:   9 pools, 9344 pgs
    objects: 1.21M objects, 4.6 TiB
    usage:   16 TiB used, 110 TiB / 126 TiB avail
    pgs:     9344 active+clean

  io:
    client:   2.7 KiB/s rd, 13 MiB/s wr, 0 op/s rd, 97 op/s wr
|
|