|
|
楼主 |
发表于 2023-5-22 17:59:35
|
显示全部楼层
1.查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_ERR 1/973013 objects unfound (0.000%); 17 scrub errors; Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair; Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
OBJECT_UNFOUND 1/973013 objects unfound (0.000%)
    pg 2.2b has 1 unfound objects
OSD_SCRUB_ERRORS 17 scrub errors
PG_DAMAGED Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound
    pg 2.44 is active+clean+inconsistent, acting [14,8,21]
    pg 2.73 is active+clean+inconsistent, acting [25,14,8]
    pg 2.80 is active+clean+scrubbing+deep+inconsistent+repair, acting [4,8,14]
    pg 2.83 is active+clean+inconsistent, acting [14,13,6]
    pg 2.ae is active+clean+inconsistent, acting [14,3,2]
    pg 2.c4 is active+clean+inconsistent, acting [8,21,14]
    pg 2.da is active+clean+inconsistent, acting [23,14,15]
    pg 2.fa is active+clean+inconsistent, acting [14,23,25]
PG_DEGRADED Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

从输出发现pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

现在我们来查看pg 2.2b,看看这个pg的详细信息。

[root@k8snode001 ~]# ceph pg dump_json pools |grep 2.2b
dumped all
2.2b 2487 1 1 0 1 9533198403 3048 3048 active+recovery_unfound+degraded 2020-07-23 08:56:07.669903 10373'5448370 10373:7312614 [14,22,4] 14 [14,22,4] 14 10371'5437258 2020-07-23 08:56:06.637012 10371'5437258 2020-07-23 08:56:06.637012 0

可以看到它现在只有一个副本
2.查看pg map

[root@k8snode001 ~]# ceph pg map 2.2b
osdmap e10373 pg 2.2b (2.2b) -> up [14,22,4] acting [14,22,4]

从pg map可以看出,pg 2.2b分布到osd [14,22,4]上

3.查看存储池状态

[root@k8snode001 ~]# ceph osd pool stats k8s-1
pool k8s-1 id 2
  1/1955664 objects degraded (0.000%)
  1/651888 objects unfound (0.000%)
  client io 271 KiB/s wr, 0 op/s rd, 52 op/s wr

[root@k8snode001 ~]# ceph osd pool ls detail|grep k8s-1
pool 2 'k8s-1' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 256 pgp_num 256 last_change 88 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd

4.尝试恢复pg 2.2b丢失的块

[root@k8snode001 ~]# ceph pg repair 2.2b

如果一直修复不成功,可以查看卡住PG的具体信息,主要关注recovery_state,命令如下:

[root@k8snode001 ~]# ceph pg 2.2b query
{
    ......
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2020-07-21 14:17:05.855923",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "10370",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2020-07-21 14:17:04.814061"
        }
    ],
    "agent_state": {}
}

如果repair修复不了,有两种解决方案:回退旧版或者直接删除

5.解决方案

回退旧版
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost revert

直接删除
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost delete

6.验证

我这里直接删除了,然后ceph集群重建pg,稍等会再看,pg状态变为active+clean

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 11069,
    "up": [
        12,
        22,
        4
    ],
    ......

再次查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_OK
|
|