找回密码
 注册
查看: 1316|回复: 0

HEALTH_WARN 1 daemons have recently crashed 解决过程

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2022-3-1 19:01:12 | 显示全部楼层 |阅读模式
Ceph 出现 HEALTH_WARN 告警,解决流程如下:
[root@hostceph1 ~]# ceph health detail
HEALTH_WARN 1 daemons have recently crashed
RECENT_CRASH 1 daemons have recently crashed
    osd.29 crashed on host compute08 at 2022-03-01 10:31:17.079004Z


[root@hostceph1 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10 osd.29  *
[root@hostceph1 ~]# ceph crash info  2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
{
    "os_version_id": "7",
    "assert_condition": "e.version > info.last_update",
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "os_name": "CentOS Linux",
    "entity_name": "osd.29",
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc",
    "timestamp": "2022-03-01 10:31:17.079004Z",
    "process_name": "ceph-osd",
    "utsname_machine": "x86_64",
    "assert_line": 3964,
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "tp_osd_tp",
    "utsname_version": "#1 SMP Wed Nov 18 03:43:48 UTC 2020",
    "backtrace": [
        "(()+0xf630) [0x7fb551f8f630]",
        "(gsignal()+0x37) [0x7fb550d82387]",
        "(abort()+0x148) [0x7fb550d83a78]",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x55adc93aa704]",
        "(()+0x4cc87d) [0x55adc93aa87d]",
        "(PG::add_log_entry(pg_log_entry_t const&, bool)+0x1f5) [0x55adc953f3f5]",
        "(PG::append_log(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, eversion_t, eversion_t, ObjectStore::Transaction&, bool, bool)+0x10b) [0x55adc956f01b]",
        "(non-virtual thunk to PrimaryLogPG::log_operation(std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> > const&, boost::optional<pg_hit_set_history_t> const&, eversion_t const&, eversion_t const&, bool, ObjectStore::Transaction&, bool)+0x95) [0x55adc96598a5]",
        "(ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>)+0xaa9) [0x55adc977a7a9]",
        "(ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x257) [0x55adc9788f57]",
        "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x4a) [0x55adc9699dea]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x5b3) [0x55adc964a1d3]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x362) [0x55adc948ab62]",
        "(PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55adc9719752]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f) [0x55adc94a5b5f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x55adc9a49dd6]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55adc9a4c8f0]",
        "(()+0x7ea5) [0x7fb551f87ea5]",
        "(clone()+0x6d) [0x7fb550e4a9fd]"
    ],
    "utsname_hostname": "compute08",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: In function 'void PG::add_log_entry(const pg_log_entry_t&, bool)' thread 7fb52ad89700 time 2022-03-01 18:31:17.054438\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/osd/PG.cc: 3964: FAILED ceph_assert(e.version > info.last_update)\n",
    "crash_id": "2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10",
    "assert_func": "void PG::add_log_entry(const pg_log_entry_t&, bool)",
    "ceph_version": "14.2.8-111.el7"
}


[root@hostceph1 ~]# ceph crash archive 2022-03-01_10:31:17.079004Z_11fa7732-990f-4166-8de5-943ff6f07c10
[root@hostceph1 ~]# ceph health detail
HEALTH_OK


解决完成。
注意:ceph crash archive 只是把这条崩溃记录标记为已归档,使其不再触发 RECENT_CRASH 告警;它不会修复导致 osd.29 断言失败(ceph_assert(e.version > info.last_update))的根本原因,如果该 OSD 反复崩溃,仍需进一步排查。

以下只是查看命令(前三条命令因语法错误而失败,原样保留其输出):
[root@hostceph1 ~]# ceph config get  mgr/crash/warn_recent_interval
Error EINVAL: unrecognized entity 'mgr/crash/warn_recent_interval'
[root@hostceph1 ~]# ceph get mgr/crash/warn_recent_interval
no valid command found; 10 closest matches:
osd pause
osd unpause
osd get-require-min-compat-client
osd set-require-min-compat-client <version> {--yes-i-really-mean-it}
osd set-backfillfull-ratio <float[0.0-1.0]>
osd set-nearfull-ratio <float[0.0-1.0]>
mds count-metadata <property>
mds metadata {<who>}
fs dump {<int[0-]>}
versions
Error EINVAL: invalid command
[root@hostceph1 ~]# ceph config set mgr/crash/warn_recent_interval  0
Invalid command: missing required parameter value(<string>)
config set <who> <name> <value> {--force} :  Set a configuration option for one or more entities
Error EINVAL: invalid command
说明:上面的命令失败是因为缺少 <who> 参数(见上面的用法提示 config set <who> <name> <value>)。正确写法需要先指定实体 mgr,例如:
ceph config get mgr mgr/crash/warn_recent_interval
ceph config set mgr mgr/crash/warn_recent_interval 0
[root@hostceph1 ~]# ceph crash archive-all
[root@hostceph1 ~]# ceph -s
  cluster:
    id:     29046cc0-0682-496b-98b1-912e59964282
    health: HEALTH_OK

  services:
    mon: 3 daemons, quorum hostceph1,hostceph2,hostceph3 (age 27m)
    mgr: hostceph1(active, since 53m), standbys: hostceph2, hostceph3
    osd: 34 osds: 34 up (since 27m), 34 in (since 45m)

  data:
    pools:   9 pools, 9344 pgs
    objects: 1.21M objects, 4.6 TiB
    usage:   16 TiB used, 110 TiB / 126 TiB avail
    pgs:     9344 active+clean

  io:
    client:   2.7 KiB/s rd, 13 MiB/s wr, 0 op/s rd, 97 op/s wr

您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 00:58 , Processed in 0.023900 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表