找回密码
 注册
查看: 4478|回复: 0

ceph 分布式存储 ceph -s 提示1 daemons have recently crashed

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2021-5-25 09:51:14 | 显示全部楼层 |阅读模式
ceph 分布式存储 ceph -s 提示 1 daemons have recently crashed
处理过程如下:

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all

[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

归档后,“daemons have recently crashed”告警已消除。
(注意:归档只是清除该告警;上面的输出中 pgs stale 和 osd.0 的 slow ops 仍然存在,且 crash 信息显示 dm-2 上发生了 IO 错误(io_error_code: -5),可能是硬件问题,需要检查 compute01 的内核日志及对应磁盘。)

相关处理方法介绍如下:
使用ceph -s查看集群状态,发现一直有如下报错,且数量一直在增加
daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
查看历史crash
ceph crash ls-new
根据ls出来的id查看详细信息
ceph crash info <crash-id>
将历史crash信息进行归档,即不再显示
ceph crash archive <crash-id>
归档所有信息
ceph crash archive-all

The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr/crash/warn_recent_interval 0

您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 00:58 , Processed in 0.018518 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表