|
|
楼主 |
发表于 2023-5-22 17:59:35
|
显示全部楼层
1.查看集群状态
[root@k8snode001 ~]# ceph health detail
HEALTH_ERR 1/973013 objects unfound (0.000%); 17 scrub errors; Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair; Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
OBJECT_UNFOUND 1/973013 objects unfound (0.000%)
    pg 2.2b has 1 unfound objects
OSD_SCRUB_ERRORS 17 scrub errors
PG_DAMAGED Possible data damage: 1 pg recovery_unfound, 8 pgs inconsistent, 1 pg repair
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound
    pg 2.44 is active+clean+inconsistent, acting [14,8,21]
    pg 2.73 is active+clean+inconsistent, acting [25,14,8]
    pg 2.80 is active+clean+scrubbing+deep+inconsistent+repair, acting [4,8,14]
    pg 2.83 is active+clean+inconsistent, acting [14,13,6]
    pg 2.ae is active+clean+inconsistent, acting [14,3,2]
    pg 2.c4 is active+clean+inconsistent, acting [8,21,14]
    pg 2.da is active+clean+inconsistent, acting [23,14,15]
    pg 2.fa is active+clean+inconsistent, acting [14,23,25]
PG_DEGRADED Degraded data redundancy: 1/2919039 objects degraded (0.000%), 1 pg degraded
    pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

从输出发现pg 2.2b is active+recovery_unfound+degraded, acting [14,22,4], 1 unfound

现在我们来查看pg 2.2b,看看这个pg的详细信息。

[root@k8snode001 ~]# ceph pg dump_json pools |grep 2.2b
dumped all
2.2b 2487 1 1 0 1 9533198403 3048 3048 active+recovery_unfound+degraded 2020-07-23 08:56:07.669903 10373'5448370 10373:7312614 [14,22,4] 14 [14,22,4] 14 10371'5437258 2020-07-23 08:56:06.637012 10371'5437258 2020-07-23 08:56:06.637012 0

可以看到它现在只有一个副本

2.查看pg map

[root@k8snode001 ~]# ceph pg map 2.2b
osdmap e10373 pg 2.2b (2.2b) -> up [14,22,4] acting [14,22,4]

从pg map可以看出,pg 2.2b分布到osd [14,22,4]上

3.查看存储池状态

[root@k8snode001 ~]# ceph osd pool stats k8s-1
pool k8s-1 id 2
  1/1955664 objects degraded (0.000%)
  1/651888 objects unfound (0.000%)
  client io 271 KiB/s wr, 0 op/s rd, 52 op/s wr

[root@k8snode001 ~]# ceph osd pool ls detail|grep k8s-1
pool 2 'k8s-1' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 256 pgp_num 256 last_change 88 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd

4.尝试恢复pg 2.2b丢失的块
[root@k8snode001 ~]# ceph pg repair 2.2b

如果一直修复不成功,可以查看卡住PG的具体信息,主要关注recovery_state,命令如下

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "......
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2020-07-21 14:17:05.855923",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "10370",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2020-07-21 14:17:04.814061"
        }
    ],
    "agent_state": {}
}

如果repair修复不了;两种解决方案,回退旧版或者直接删除

5.解决方案
回退旧版
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost revert
直接删除
[root@k8snode001 ~]# ceph pg 2.2b mark_unfound_lost delete

6.验证
我这里直接删除了,然后ceph集群重建pg,稍等会再看,pg状态变为active+clean

[root@k8snode001 ~]# ceph pg 2.2b query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 11069,
    "up": [
        12,
        22,
        4
    ],

再次查看集群状态

[root@k8snode001 ~]# ceph health detail
HEALTH_OK
|
|