找回密码
 注册
查看: 4480|回复: 0

ceph 分布式存储 ceph -s 提示1 daemons have recently crashed

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2021-5-25 09:51:14 | 显示全部楼层 |阅读模式
ceph 分布式存储 ceph -s 提示1 daemons have recently crashed
处理过程如下:

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all



[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

归档后,“1 daemons have recently crashed”告警已消除(pgs stale 和 slow ops 告警仍然存在,需另行排查)。

相关处理方法介绍如下:

使用ceph -s查看集群状态,发现一直有如下报错,且数量一直在增加
daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
查看历史crash
ceph crash ls-new
根据ls出来的id查看详细信息
ceph crash info <crash-id>
将历史crash信息进行归档,即不再显示
ceph crash archive <crash-id>
归档所有信息
ceph crash archive-all

The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr mgr/crash/warn_recent_interval 0


您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 01:05 , Processed in 0.020003 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表