ceph 分布式存储:ceph -s 提示 "1 daemons have recently crashed"
处理过程如下:

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *

[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

归档之后,"1 daemons have recently crashed" 告警不再显示,问题解决。

相关处理方法介绍如下:
使用 ceph -s 查看集群状态,发现一直有如下报错,且数量一直在增加:daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
1. 查看历史 crash:ceph crash ls-new
2. 根据 ls 出来的 id 查看详细信息:ceph crash info <crash-id>
3. 将历史 crash 信息进行归档,即不再显示:ceph crash archive <crash-id>
4. 归档所有信息:ceph crash archive-all

The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr mgr/crash/warn_recent_interval 0