找回密码
 注册
查看: 560|回复: 0

HEALTH_ERR 1 scrub errors Possible data damage: 1 pg inconsistent 处理过程并恢复

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2022-9-16 15:21:58 | 显示全部楼层 |阅读模式
[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_ERR
            1 scrub errors
            Possible data damage: 1 pg inconsistent
            1 slow ops, oldest one blocked for 51555 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3709 active+clean
             2    active+clean+scrubbing+deep
             1    active+clean+inconsistent

  io:
    client:   1.2 MiB/s rd, 7.3 MiB/s wr, 1.54k op/s rd, 533 op/s wr

查看状态:

[root@compute01 ~]# ceph health detail
HEALTH_ERR 1 scrub errors; Possible data damage: 1 pg inconsistent; 1 slow ops, oldest one blocked for 51565 sec, mon.compute01 has slow ops
OSD_SCRUB_ERRORS 1 scrub errors
PG_DAMAGED Possible data damage: 1 pg inconsistent
    pg 9.167 is active+clean+inconsistent, acting [9,11]
SLOW_OPS 1 slow ops, oldest one blocked for 51565 sec, mon.compute01 has slow ops

修复pg
[root@compute01 ~]# ceph pg repair 9.167
instructing pg 9.167 on osd.9 to repair
[root@compute01 ~]# ceph health detail
HEALTH_ERR 1 scrub errors; Possible data damage: 1 pg inconsistent; 1 slow ops, oldest one blocked for 51610 sec, mon.compute01 has slow ops
OSD_SCRUB_ERRORS 1 scrub errors
PG_DAMAGED Possible data damage: 1 pg inconsistent
    pg 9.167 is active+clean+scrubbing+deep+inconsistent+repair, acting [9,11]
SLOW_OPS 1 slow ops, oldest one blocked for 51610 sec, mon.compute01 has slow ops
[root@compute01 ~]# ceph health detail
HEALTH_ERR 1 scrub errors; Possible data damage: 1 pg inconsistent; 1 slow ops, oldest one blocked for 51615 sec, mon.compute01 has slow ops
OSD_SCRUB_ERRORS 1 scrub errors
PG_DAMAGED Possible data damage: 1 pg inconsistent
    pg 9.167 is active+clean+scrubbing+deep+inconsistent+repair, acting [9,11]
SLOW_OPS 1 slow ops, oldest one blocked for 51615 sec, mon.compute01 has slow ops

[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_WARN
            1 slow ops, oldest one blocked for 51700 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3710 active+clean
             2    active+clean+scrubbing+deep

  io:
    client:   921 KiB/s rd, 8.3 MiB/s wr, 1.17k op/s rd, 545 op/s wr

等一会儿,scrub 错误告警消失,只剩 slow ops 告警了。
[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_WARN
            1 slow ops, oldest one blocked for 51705 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3710 active+clean
             2    active+clean+scrubbing+deep

  io:
    client:   698 KiB/s rd, 8.0 MiB/s wr, 901 op/s rd, 556 op/s wr

[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_WARN
            1 slow ops, oldest one blocked for 51705 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3710 active+clean
             2    active+clean+scrubbing+deep

  io:
    client:   601 KiB/s rd, 8.2 MiB/s wr, 787 op/s rd, 569 op/s wr

检查下时间同步:
[root@compute01 ~]# chronyc sources
210 Number of sources = 1
MS Name/IP address         Stratum Poll Reach LastRx Last sample
===============================================================================
^* 119.28.183.184                2   6    27    26  +3312us[+7317us] +/-   86ms
[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_WARN
            1 slow ops, oldest one blocked for 51780 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3710 active+clean
             2    active+clean+scrubbing+deep

  io:
    client:   968 KiB/s rd, 9.1 MiB/s wr, 1.21k op/s rd, 624 op/s wr

[root@compute01 ~]# ceph health detail
HEALTH_WARN 1 slow ops, oldest one blocked for 51795 sec, mon.compute01 has slow ops
SLOW_OPS 1 slow ops, oldest one blocked for 51795 sec, mon.compute01 has slow ops
重启下 ceph-mon.target 服务:
[root@compute01 ~]# systemctl restart ceph-mon
ceph-mon@                   ceph-mon@compute01.service  ceph-mon.target
[root@compute01 ~]# systemctl restart ceph-mon.target
查看状态:
[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_WARN
            1 slow ops, oldest one blocked for 51855 sec, mon.compute01 has slow ops

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 14h)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3708 active+clean
             4    active+clean+scrubbing+deep

  io:
    client:   782 KiB/s rd, 7.5 MiB/s wr, 989 op/s rd, 463 op/s wr

等几十秒钟再查看:

[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_OK

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 3s)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3708 active+clean
             4    active+clean+scrubbing+deep

  io:
    client:   508 KiB/s rd, 9.1 MiB/s wr, 667 op/s rd, 621 op/s wr

[root@compute01 ~]# ceph -s
  cluster:
    id:     2af51d38-db90-4a57-a43d-ea9f6ebd7482
    health: HEALTH_OK

  services:
    mon: 5 daemons, quorum compute01,compute02,compute03,compute05,compute08 (age 5s)
    mgr: compute03(active, since 4M), standbys: compute02, compute01, compute08, compute05
    mds:  1 up:standby
    osd: 32 osds: 32 up (since 14h), 32 in (since 4M)

  data:
    pools:   7 pools, 3712 pgs
    objects: 1.88M objects, 7.2 TiB
    usage:   14 TiB used, 129 TiB / 144 TiB avail
    pgs:     3708 active+clean
             4    active+clean+scrubbing+deep

  io:
    client:   680 KiB/s rd, 10 MiB/s wr, 869 op/s rd, 723 op/s wr

[root@compute01 ~]#

状态正常,问题解决。

您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 02:01 , Processed in 0.017929 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表