找回密码
 注册
查看: 1315|回复: 0

HEALTH_WARN 1 daemons have recently crashed 解决过程

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2022-3-1 19:01:12 | 显示全部楼层 |阅读模式
ceph 出现告警,解决流程:
[root@hostceph1 ~]# ceph health detail
HEALTH_WARN 1 daemons have recently crashed
RECENT_CRASH 1 daemons have recently crashed
    osd.29 crashed on host compute08 at 2022-03-01 10:31:17.079004Z


[root@hostceph1 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10 osd.29  *
[root@hostceph1 ~]# ceph crash info  2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
{
    "os_version_id": "7",
    "assert_condition": "e.version > info.last_update",
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "os_name": "CentOS Linux",
    "entity_name": "osd.29",
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc",
    "timestamp": "2022-03-01 10:31:17.079004Z",
    "process_name": "ceph-osd",
    "utsname_machine": "x86_64",
    "assert_line": 3964,
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "tp_osd_tp",
    "utsname_version": "#1 SMP Wed Nov 18 03:43:48 UTC 2020",
    "backtrace": [
        "(()+0xf630) [0x7fb551f8f630]",
        "(gsignal()+0x37) [0x7fb550d82387]",
        "(abort()+0x148) [0x7fb550d83a78]",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x55adc93aa704]",
        "(()+0x4cc87d) [0x55adc93aa87d]",
        "(PG::add_log_entry(pg_log_entry_t const&, bool)+0x1f5) [0x55adc953f3f5]",
        "(PG::append_log(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, eversion_t, eversion_t, ObjectStore::Transaction&, bool, bool)+0x10b) [0x55adc956f01b]",
        "(non-virtual thunk to PrimaryLogPG::log_operation(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, boost::optional<pg_hit_set_history_t> const&, eversion_t const&, eversion_t const&, bool, ObjectStore::Transaction&, bool)+0x95) [0x55adc96598a5]",
        "(ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>)+0xaa9) [0x55adc977a7a9]",
        "(ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x257) [0x55adc9788f57]",
        "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x4a) [0x55adc9699dea]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x5b3) [0x55adc964a1d3]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x362) [0x55adc948ab62]",
        "(PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55adc9719752]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f) [0x55adc94a5b5f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x55adc9a49dd6]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55adc9a4c8f0]",
        "(()+0x7ea5) [0x7fb551f87ea5]",
        "(clone()+0x6d) [0x7fb550e4a9fd]"
    ],
    "utsname_hostname": "compute08",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: In function 'void PG::add_log_entry(const pg_log_entry_t&, bool)' thread 7fb52ad89700 time 2022-03-01 18:31:17.054438\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: 3964: FAILED ceph_assert(e.version > info.last_update)\n",
    "crash_id": "2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10",
    "assert_func": "void PG::add_log_entry(const pg_log_entry_t&, bool)",
    "ceph_version": "14.2.8-111.el7"
}


[root@hostceph1 ~]# ceph crash archive 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
[root@hostceph1 ~]# ceph health detail
HEALTH_OK



解决完成。

以下只是查看命令:
[root@hostceph1 ~]# ceph config get  mgr/crash/warn_recent_interval
Error EINVAL: unrecognized entity 'mgr/crash/warn_recent_interval'
[root@hostceph1 ~]# ceph get mgr/crash/warn_recent_interval
no valid command found; 10 closest matches:
osd pause
osd unpause
osd get-require-min-compat-client
osd set-require-min-compat-client <version> {--yes-i-really-mean-it}
osd set-backfillfull-ratio <float[0.0-1.0]>
osd set-nearfull-ratio <float[0.0-1.0]>
mds count-metadata <property>
mds metadata {<who>}
fs dump {<int[0-]>}
versions
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph config set mgr/crash/warn_recent_interval  0
Invalid command: missing required parameter value(<string>)
config set <who> <name> <value> {--force} :  Set a configuration option for one or more entities
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph crash archive-all
[root@hostceph1 ~]# ceph -s
  cluster:
    id:     29046cc0-0682-496b-98b1-912e59964282
    health: HEALTH_OK

  services:
    mon: 3 daemons, quorum hostceph1,hostceph2,hostceph3 (age 27m)
    mgr: hostceph1(active, since 53m), standbys: hostceph2, hostceph3
    osd: 34 osds: 34 up (since 27m), 34 in (since 45m)

  data:
    pools:   9 pools, 9344 pgs
    objects: 1.21M objects, 4.6 TiB
    usage:   16 TiB used, 110 TiB / 126 TiB avail
    pgs:     9344 active+clean

  io:
    client:   2.7 KiB/s rd, 13 MiB/s wr, 0 op/s rd, 97 op/s wr

您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 00:09 , Processed in 0.018204 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表