ceph 分布式存储:ceph -s 提示 "1 daemons have recently crashed"
处理过程如下:
[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

归档后,"1 daemons have recently crashed" 告警消失,该问题解决。(注意:归档只是不再显示历史 crash 记录;ceph -s 中仍存在的 stale pgs 和 slow ops 告警,以及 crash 信息里提示的 IO error(可能是硬件问题),仍需另行排查。)

相关处理介绍如下:
使用 ceph -s 查看集群状态,发现一直有如下报错,且数量一直在增加:daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
1. 查看历史 crash:ceph crash ls-new
2. 根据 ls 出来的 id 查看详细信息:ceph crash info <crash-id>
3. 将历史 crash 信息进行归档,即不再显示:ceph crash archive <crash-id>
4. 归档所有信息:ceph crash archive-all

The time period for what "recent" means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr mgr/crash/warn_recent_interval 0

|