- 积分
- 16844
在线时间 小时
最后登录1970-1-1
|

楼主 |
发表于 2023-5-22 17:59:35
|
显示全部楼层
1.查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_ERR 1/973013 objects unfound (0.000%); 17 scrub errors; Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair; Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
OBJECT_UNFOUND 1/973013 objects unfound (0.000%)
    pg 2.2b has 1 unfound objects
OSD_SCRUB_ERRORS 17 scrub errors
PG_DAMAGED Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound
    pg 2.44 is active+clean+inconsistent, acting [14,8,21]
    pg 2.73 is active+clean+inconsistent, acting [25,14,8]
    pg 2.80 is active+clean+scrubbing+deep+inconsistent+repair, acting [4,8,14]
    pg 2.83 is active+clean+inconsistent, acting [14,13,6]
    pg 2.ae is active+clean+inconsistent, acting [14,3,2]
    pg 2.c4 is active+clean+inconsistent, acting [8,21,14]
    pg 2.da is active+clean+inconsistent, acting [23,14,15]
    pg 2.fa is active+clean+inconsistent, acting [14,23,25]
PG_DEGRADED Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

从输出发现pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

现在我们来查看pg 2.2b,看看这个pg的详细信息。

[root@k8snode001 ~]# ceph pg dump_json pools |grep 2.2b
dumped all
2.2b 2487 1 1 0 1 9533198403 3048 3048 active+recovery_unfound+degraded 2020-07-23 08:56:07.669903 10373'5448370 10373:7312614 [14,22,4] 14 [14,22,4] 14 10371'5437258 2020-07-23 08:56:06.637012 10371'5437258 2020-07-23 08:56:06.637012 0

可以看到它现在只有一个副本

2.查看pg map

[root@k8snode001 ~]# ceph pg map 2.2b
osdmap e10373 pg 2.2b (2.2b) -> up [14,22,4] acting [14,22,4]

从pg map可以看出,pg 2.2b分布到osd [14,22,4]上

3.查看存储池状态

[root@k8snode001 ~]# ceph osd pool stats k8s-1
pool k8s-1 id 2
  1/1955664 objects degraded (0.000%)
  1/651888 objects unfound (0.000%)
  client io 271 KiB/s wr, 0 op/s rd, 52 op/s wr

[root@k8snode001 ~]# ceph osd pool ls detail|grep k8s-1
pool 2 'k8s-1' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 256 pgp_num 256 last_change 88 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd

4.尝试恢复pg 2.2b丢失的块

[root@k8snode001 ~]# ceph pg repair 2.2b

如果一直修复不成功,可以查看卡住PG的具体信息,主要关注recovery_state,命令如下

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "......
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2020-07-21 14:17:05.855923",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "10370",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2020-07-21 14:17:04.814061"
        }
    ],
    "agent_state": {}
}

如果repair修复不了,有两种解决方案:回退旧版或者直接删除。

5.解决方案

回退旧版

[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost revert

直接删除

[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost delete

6.验证

我这里直接删除了,然后ceph集群重建pg,稍等会再看,pg状态变为active+clean

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 11069,
    "up": [
        12,
        22,
        4
    ],

再次查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_OK
|
|