Ceph 分布式存储:ceph -s 提示 “1 daemons have recently crashed”

处理过程如下:

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

归档后问题解决:“1 daemons have recently crashed” 告警不再显示(pgs stale 与 slow ops 告警仍然存在,需另行处理)。

借助相关处理介绍如下:
使用 ceph -s 查看集群状态,发现一直有如下报错,且数量一直在增加:daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
查看历史 crash:
    ceph crash ls-new
根据 ls 出来的 id 查看详细信息:
    ceph crash info <crash-id>
将历史 crash 信息进行归档,即不再显示:
    ceph crash archive <crash-id>
归档所有信息:
    ceph crash archive-all

The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
    # ceph config set mgr mgr/crash/warn_recent_interval 0

|