Ceph 分布式存储:ceph -s 提示 "1 daemons have recently crashed"
处理过程如下:
[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean
归档后,"daemons have recently crashed" 告警不再显示,问题解决。(注意:归档只是清除 crash 告警;上面输出中的 pgs stale 和 slow ops 告警仍然存在,需另行排查。)

相关处理方法介绍如下:
使用 ceph -s 查看集群状态,发现一直有如下报错,且数量一直在增加:daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
查看历史 crash:ceph crash ls-new
根据 ls 出来的 id 查看详细信息:ceph crash info <crash-id>
将历史 crash 信息进行归档,即不再显示:ceph crash archive <crash-id>
归档所有信息:ceph crash archive-all
The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr/crash/warn_recent_interval 0