找回密码
 注册
查看: 4479|回复: 0

ceph 分布式存储 ceph -s 提示1 daemons have recently crashed

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2021-5-25 09:51:14 | 显示全部楼层 |阅读模式
ceph 分布式存储 ceph -s 提示1 daemons have recently crashed
处理过程如下:
[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            1 daemons have recently crashed
            4 slow ops, oldest one blocked for 544 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 11m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 9m), 3 in (since 9m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean

[root@compute02 ~]# ceph crash ls-new
ID                                                               ENTITY NEW
2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3 osd.0   *
[root@compute02 ~]# ceph crash info 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
{
    "os_version_id": "7",
    "utsname_machine": "x86_64",
    "entity_name": "osd.0",
    "io_error": true,
    "backtrace": [
        "(()+0xf630) [0x7f22dcef7630]",
        "(gsignal()+0x37) [0x7f22dbcea387]",
        "(abort()+0x148) [0x7f22dbceba78]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::string const&)+0x1a5) [0x56498eb2edfc]",
        "(KernelDevice::_aio_thread()+0xebe) [0x56498f17a1de]",
        "(KernelDevice::AioCompletionThread::entry()+0xd) [0x56498f17c89d]",
        "(()+0x7ea5) [0x7f22dceefea5]",
        "(clone()+0x6d) [0x7f22dbdb29fd]"
    ],
    "io_error_optype": 8,
    "io_error_length": 4096,
    "assert_line": 534,
    "utsname_release": "3.10.0-1160.el7.x86_64",
    "io_error_offset": 288585248768,
    "assert_file": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc",
    "io_error_devname": "dm-2",
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "bstore_aio",
    "assert_msg": "/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: In function 'void KernelDevice::_aio_thread()' thread 7f22d04a8700 time 2021-05-24 22:59:54.033676\n/home/miles/rpmbuild/BUILD/ceph-14.2.8/src/os/bluestore/KernelDevice.cc: 534: ceph_abort_msg(\"Unexpected IO error. This may suggest a hardware issue. Please check your kernel log!\")\n",
    "assert_func": "void KernelDevice::_aio_thread()",
    "ceph_version": "14.2.8-111.el7",
    "io_error_path": "/var/lib/ceph/osd/ceph-0/block",
    "os_name": "CentOS Linux",
    "timestamp": "2021-05-24 14:59:54.039272Z",
    "process_name": "ceph-osd",
    "utsname_hostname": "compute01",
    "crash_id": "2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3",
    "assert_condition": "abort",
    "utsname_version": "#1 SMP Mon Oct 19 16:18:59 UTC 2020",
    "io_error_code": -5
}
[root@compute02 ~]# ceph crash  archive 2021-05-24_14:59:54.039272Z_69fc0f11-81bf-4428-aece-20a18f2b03e3
[root@compute02 ~]# ceph crash archive-all
[root@compute02 ~]# ceph -s
  cluster:
    id:     dd1ff8b6-f7b8-47a3-890c-17f75894562a
    health: HEALTH_WARN
            Reduced data availability: 171 pgs stale
            4 slow ops, oldest one blocked for 738 sec, osd.0 has slow ops

  services:
    mon: 3 daemons, quorum compute01,compute02,compute03 (age 14m)
    mgr: compute02(active, since 4d), standbys: compute03, compute01
    osd: 3 osds: 3 up (since 12m), 3 in (since 12m)

  data:
    pools:   4 pools, 512 pgs
    objects: 7.35k objects, 52 GiB
    usage:   35 GiB used, 1.6 TiB / 1.6 TiB avail
    pgs:     341 active+clean
             171 stale+active+clean
归档后,“1 daemons have recently crashed”告警已消失,该告警问题解决(pgs stale 与 slow ops 告警仍在,需另行排查)。

相关处理方法介绍如下:
使用ceph -s查看集群状态,发现一直有如下报错,且数量一直在增加
daemons have recently crashed
经查当前系统运行状态正常,判断这里显示的应该是历史故障,处理方式如下:
查看历史crash
ceph crash ls-new
根据ls出来的id查看详细信息
ceph crash info <crash-id>
将历史crash信息进行归档,即不再显示
ceph crash archive <crash-id>
归档所有信息
ceph crash archive-all
The time period for what “recent” means is controlled by the option mgr/crash/warn_recent_interval (default: two weeks). These warnings can be disabled entirely with:
# ceph config set mgr mgr/crash/warn_recent_interval 0
您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 01:01 , Processed in 0.016918 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表