|
|
楼主 |
发表于 2023-5-22 17:59:35
|
显示全部楼层
1.查看集群状态
[root@k8snode001 ~]# ceph health detail
HEALTH_ERR 1/973013 objects unfound (0.000%); 17 scrub errors; Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair; Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
OBJECT_UNFOUND 1/973013 objects unfound (0.000%)
    pg 2.2b has 1 unfound objects
OSD_SCRUB_ERRORS 17 scrub errors
PG_DAMAGED Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound
    pg 2.44 is active+clean+inconsistent, acting [14,8,21]
    pg 2.73 is active+clean+inconsistent, acting [25,14,8]
    pg 2.80 is active+clean+scrubbing+deep+inconsistent+repair, acting [4,8,14]
    pg 2.83 is active+clean+inconsistent, acting [14,13,6]
    pg 2.ae is active+clean+inconsistent, acting [14,3,2]
    pg 2.c4 is active+clean+inconsistent, acting [8,21,14]
    pg 2.da is active+clean+inconsistent, acting [23,14,15]
    pg 2.fa is active+clean+inconsistent, acting [14,23,25]
PG_DEGRADED Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

从输出发现pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

现在我们来查看pg 2.2b,看看这个pg的详细信息。

[root@k8snode001 ~]# ceph pg dump_json pools |grep 2.2b
dumped all
2.2b 2487 1 1 0 1 9533198403 3048 3048 active+recovery_unfound+degraded 2020-07-23 08:56:07.669903 10373'5448370 10373:7312614 [14,22,4] 14 [14,22,4] 14 10371'5437258 2020-07-23 08:56:06.637012 10371'5437258 2020-07-23 08:56:06.637012 0

可以看到它现在只有一个副本

2.查看pg map

[root@k8snode001 ~]# ceph pg map 2.2b
osdmap e10373 pg 2.2b (2.2b) -> up [14,22,4] acting [14,22,4]

从pg map可以看出,pg 2.2b分布到osd [14,22,4]上

3.查看存储池状态

[root@k8snode001 ~]# ceph osd pool stats k8s-1
pool k8s-1 id 2
  1/1955664 objects degraded (0.000%)
  1/651888 objects unfound (0.000%)
  client io 271 KiB/s wr, 0 op/s rd, 52 op/s wr

[root@k8snode001 ~]# ceph osd pool ls detail|grep k8s-1
pool 2 'k8s-1' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 256 pgp_num 256 last_change 88 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd

4.尝试恢复pg 2.2b丢失的块
[root@k8snode001 ~]# ceph pg repair 2.2b

如果一直修复不成功,可以查看卡住PG的具体信息,主要关注recovery_state,命令如下

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "......
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2020-07-21 14:17:05.855923",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "10370",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2020-07-21 14:17:04.814061"
        }
    ],
    "agent_state": {}
}

如果repair修复不了,有两种解决方案:回退到旧版本,或者直接删除。
(注意:revert 会把unfound对象回退到上一个版本,如果它是新创建的对象则直接丢弃;delete 会彻底丢弃该对象,数据无法恢复,请谨慎操作。)

5.解决方案
回退旧版
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost revert

直接删除
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost delete

6.验证
我这里直接删除了,然后ceph集群会重建pg,稍等一会儿再查看,pg状态变为active+clean

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 11069,
    "up": [
        12,
        22,
        4
    ],

再次查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_OK
|
|