|
|
楼主 |
发表于 2023-5-22 17:59:35
|
显示全部楼层
1.查看集群状态
[root@k8snode001 ~]# ceph health detail
HEALTH_ERR 1/973013 objects unfound (0.000%); 17 scrub errors; Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair; Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
OBJECT_UNFOUND 1/973013 objects unfound (0.000%)
    pg 2.2b has 1 unfound objects
OSD_SCRUB_ERRORS 17 scrub errors
PG_DAMAGED Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound
    pg 2.44 is active+clean+inconsistent, acting [14,8,21]
    pg 2.73 is active+clean+inconsistent, acting [25,14,8]
    pg 2.80 is active+clean+scrubbing+deep+inconsistent+repair, acting [4,8,14]
    pg 2.83 is active+clean+inconsistent, acting [14,13,6]
    pg 2.ae is active+clean+inconsistent, acting [14,3,2]
    pg 2.c4 is active+clean+inconsistent, acting [8,21,14]
    pg 2.da is active+clean+inconsistent, acting [23,14,15]
    pg 2.fa is active+clean+inconsistent, acting [14,23,25]
PG_DEGRADED Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

从输出发现pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

现在我们来查看pg 2.2b,看看这个pg的详细信息。

[root@k8snode001 ~]# ceph pg dump_json pools |grep 2.2b
dumped all
2.2b 2487 1 1 0 1 9533198403 3048 3048 active+recovery_unfound+degraded 2020-07-23 08:56:07.669903 10373'5448370 10373:7312614 [14,22,4] 14 [14,22,4] 14 10371'5437258 2020-07-23 08:56:06.637012 10371'5437258 2020-07-23 08:56:06.637012 0

可以看到它现在只有一个副本

2.查看pg map

[root@k8snode001 ~]# ceph pg map 2.2b
osdmap e10373 pg 2.2b (2.2b) -> up [14,22,4] acting [14,22,4]

从pg map可以看出,pg 2.2b分布到osd [14,22,4]上

3.查看存储池状态

[root@k8snode001 ~]# ceph osd pool stats k8s-1
pool k8s-1 id 2
  1/1955664 objects degraded (0.000%)
  1/651888 objects unfound (0.000%)
  client io 271 KiB/s wr, 0 op/s rd, 52 op/s wr

[root@k8snode001 ~]# ceph osd pool ls detail|grep k8s-1
pool 2 'k8s-1' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 256 pgp_num 256 last_change 88 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd

4.尝试恢复pg 2.2b丢失的块
[root@k8snode001 ~]# ceph pg repair 2.2b

如果一直修复不成功,可以查看卡住PG的具体信息,主要关注recovery_state,命令如下:

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "......
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2020-07-21 14:17:05.855923",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "10370",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2020-07-21 14:17:04.814061"
        }
    ],
    "agent_state": {}
}

如果repair修复不了,有两种解决方案:回退旧版或者直接删除。(revert 会把对象回滚到上一个版本,如果是新建对象则直接放弃;delete 则彻底放弃该对象。两种方式都可能造成数据丢失,执行前请确认对象确实无法找回。)

5.解决方案
回退旧版

[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost revert

直接删除

[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost delete

6.验证
我这里直接删除了,然后ceph集群重建pg,稍等一会儿再看,pg状态变为active+clean

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 11069,
    "up": [
        12,
        22,
        4
    ],

再次查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_OK

|
|