找回密码
 注册
查看: 4245|回复: 1

Degraded data redundancy: 1 pg undersized ceph status状态异常

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2021-6-9 15:00:16 | 显示全部楼层 |阅读模式
[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_WARN
            Degraded data redundancy: 1 pg undersized

  services:
    mon: 1 daemons, quorum controller (age 87m)
    mgr: controller.horbtx(active, since 87m)
    osd: 6 osds: 6 up (since 6m), 6 in (since 6m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

解决过程:

[root@controller ~]# vim /etc/ceph/ceph.conf

  osd_class_update_on_start = false


[root@controller ~]# ceph health detail
HEALTH_WARN Degraded data redundancy: 1 pg undersized
[WRN] PG_DEGRADED: Degraded data redundancy: 1 pg undersized
    pg 1.0 is stuck undersized for 86m, current state active+undersized+remapped, last acting [1,0]

修改配置后,需要重启osd服务:

ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.0.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.3.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.1.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.4.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.2.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.5.service
[root@controller ~]# systemctl restart ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.0.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.3.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.1.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.4.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.2.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.5.service
[root@controller ~]# systemctl restart ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.*
[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_WARN
            4 osds down
            Degraded data redundancy: 1 pg undersized

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 88m)
    osd: 6 osds: 2 up (since 0.641904s), 6 in (since 8m)

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 stale+active+undersized+remapped

[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 89m)
    osd: 6 osds: 6 up (since 6s), 6 in (since 8m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 89m)
    osd: 6 osds: 6 up (since 8s), 6 in (since 9m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 89m)
    osd: 6 osds: 6 up (since 9s), 6 in (since 9m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 89m)
    osd: 6 osds: 6 up (since 10s), 6 in (since 9m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

[root@controller ~]# ceph health detail
HEALTH_OK
[root@controller ~]# ceph health detail
HEALTH_OK
[root@controller ~]#

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
 楼主| 发表于 2021-6-9 15:00:17 | 显示全部楼层
3.1.1 说明
降级:由上文可以得知,每个PG有三个副本,分别保存在不同的OSD中,在非故障情况下,这个PG是active+clean 状态,那么,如果PG 的 副本osd.4 挂掉了,这个 PG 是降级状态。
3.1.2 故障模拟
a. 停止osd.1
$ systemctl stop ceph-osd@1
b. 查看PG状态
$ bin/ceph pg stat
20 pgs: 20 active+undersized+degraded; 14512 kB data, 302 GB used, 6388 GB / 6691 GB avail; 12/36 objects degraded (33.333%)
c. 查看集群监控状态
$ bin/ceph health detail
HEALTH_WARN 1 osds down; Degraded data redundancy: 12/36 objects degraded (33.333%), 20 pgs unclean, 20 pgs degraded; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
   osd.1 (root=default,host=ceph-xx-cc00) is down
PG_DEGRADED Degraded data redundancy: 12/36 objects degraded (33.333%), 20 pgs unclean, 20 pgs degraded
   pg 1.0 is active+undersized+degraded, acting [0,2]
   pg 1.1 is active+undersized+degraded, acting [2,0]
d. 客户端IO操作
#写入对象
$ bin/rados -p test_pool put myobject ceph.conf

#读取对象到文件
$ bin/rados -p test_pool get myobject ceph.conf.old

#查看文件
$ ll ceph.conf*
-rw-r--r-- 1 root root 6211 Jun 25 14:01 ceph.conf
-rw-r--r-- 1 root root 6211 Jul 3 19:57 ceph.conf.old
故障总结:
为了模拟故障,(size = 3, min_size = 2) 我们手动停止了 osd.1,然后查看PG状态,可见,它此刻的状态是active+undersized+degraded,当一个 PG 所在的 OSD 挂掉之后,这个 PG 就会进入undersized+degraded 状态,而后面的[0,2]的意义就是还有两个副本存活在 osd.0 和 osd.2 上, 并且这个时候客户端可以正常读写IO。
3.1.3 总结
降级就是在发生了一些故障比如OSD挂掉之后,Ceph 将这个 OSD 上的所有 PG 标记为 Degraded。
降级的集群可以正常读写数据,降级的 PG 只是相当于小毛病而已,并不是严重的问题。
Undersized的意思就是当前存活的PG 副本数为 2,小于副本数3,将其做此标记,表明存活副本数不足,也不是严重的问题。
3.2 Peered
3.2.1 说明
Peering已经完成,但是PG当前Acting Set规模小于存储池规定的最小副本数(min_size)。
3.2.2 故障模拟
a. 停掉两个副本osd.1,osd.0
$ systemctl stop ceph-osd@1
$ systemctl stop ceph-osd@0


3.2.1 说明
Peering已经完成,但是PG当前Acting Set规模小于存储池规定的最小副本数(min_size)。
3.2.2 故障模拟

a. 停掉两个副本osd.1,osd.0

$ systemctl stop ceph-osd@1
$ systemctl stop ceph-osd@0

b. 查看集群健康状态

$ bin/ceph health detail
HEALTH_WARN 1 osds down; Reduced data availability: 4 pgs inactive; Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
    osd.0 (root=default,host=ceph-xx-cc00) is down
PG_AVAILABILITY Reduced data availability: 4 pgs inactive
    pg 1.6 is stuck inactive for 516.741081, current state undersized+degraded+peered, last acting [2]
    pg 1.10 is stuck inactive for 516.737888, current state undersized+degraded+peered, last acting [2]
    pg 1.11 is stuck inactive for 516.737408, current state undersized+degraded+peered, last acting [2]
    pg 1.12 is stuck inactive for 516.736955, current state undersized+degraded+peered, last acting [2]
PG_DEGRADED Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded
    pg 1.0 is undersized+degraded+peered, acting [2]
    pg 1.1 is undersized+degraded+peered, acting [2]
c. 客户端IO操作(夯住)

#读取对象到文件,夯住IO
$ bin/rados -p test_pool get myobject ceph.conf.bak
故障总结:

现在pg 只剩下osd.2上存活,并且 pg 还多了一个状态:peered,英文的意思是仔细看,这里我们可以理解成协商、搜索。
这时候读取文件,会发现指令会卡在那个地方一直不动,为什么就不能读取内容了,因为我们设置的 min_size=2 ,如果存活数少于2,比如这里的 1 ,那么就不会响应外部的IO请求。
d. 调整min_size=1可以解决IO夯住问题

#设置min_size = 1
$ bin/ceph osd pool set test_pool min_size 1
set pool 1 min_size to 1
e. 查看集群监控状态

$ bin/ceph health detail
HEALTH_WARN 1 osds down; Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded, 20 pgs undersized; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
   osd.0 (root=default,host=ceph-xx-cc00) is down
PG_DEGRADED Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded, 20 pgs undersized
   pg 1.0 is stuck undersized for 65.958983, current state active+undersized+degraded, last acting [2]
   pg 1.1 is stuck undersized for 65.960092, current state active+undersized+degraded, last acting [2]
   pg 1.2 is stuck undersized for 65.960974, current state active+undersized+degraded, last acting [2]
f. 客户端IO操作

#读取对象到文件中
$ ll -lh ceph.conf*
-rw-r--r-- 1 root root 6.1K Jun 25 14:01 ceph.conf
-rw-r--r-- 1 root root 6.1K Jul 3 20:11 ceph.conf.bak
-rw-r--r-- 1 root root 6.1K Jul 3 20:11 ceph.conf.bak.1
故障总结:

可以看到,PG状态Peered没有了,并且客户端文件IO可以正常读写了。
当min_size=1时,只要集群里面有一份副本活着,那就可以响应外部的IO请求。


您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 01:05 , Processed in 0.018643 second(s), 22 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表