|
|
# ceph health detail
HEALTH_WARN 1 pools have many more objects per pg than average
MANY_OBJECTS_PER_PG 1 pools have many more objects per pg than average
    pool pool-hdd-2 objects per pg (5503) is more than 12.9482 times cluster average (425)

定位问题
[root@lab8106 ~]# ceph -s
    cluster fa7ec1a1-662a-4ba3-b478-7cb570482b62
     health HEALTH_WARN
            pool rbd has many more objects per pg than average (too few pgs?)
     monmap e1: 1 mons at {lab8106=192.168.8.106:6789/0}
            election epoch 30, quorum 0 lab8106
     osdmap e157: 2 osds: 2 up, 2 in
            flags sortbitwise
      pgmap v1023: 417 pgs, 13 pools, 18519 MB data, 15920 objects
            18668 MB used, 538 GB / 556 GB avail
                 417 active+clean
集群出现了这个警告:pool rbd has many more objects per pg than average (too few pgs?)。这个警告在hammer版本里面的提示是 pool rbd has too few pgs。
在这里查看集群的详细信息:
[root@lab8106 ~]# ceph health detail
HEALTH_WARN pool rbd has many more objects per pg than average (too few pgs?); mon.lab8106 low disk space
pool rbd objects per pg (1912) is more than 50.3158 times cluster average (38)
看下集群的pool的对象状态
[root@lab8106 ~]# ceph df
GLOBAL:
    SIZE     AVAIL     RAW USED     %RAW USED
    556G      538G       18668M          3.28
POOLS:
    NAME       ID     USED       %USED     MAX AVAIL     OBJECTS
    rbd        6      16071M      2.82          536G       15296
    pool1      7        204M      0.04          536G          52
    pool2      8        184M      0.03          536G          47
    pool3      9        188M      0.03          536G          48
    pool4      10       192M      0.03          536G          49
    pool5      11       204M      0.04          536G          52
    pool6      12       148M      0.03          536G          38
    pool7      13       184M      0.03          536G          47
    pool8      14       200M      0.04          536G          51
    pool9      15       200M      0.04          536G          51
    pool10     16       248M      0.04          536G          63
    pool11     17       232M      0.04          536G          59
    pool12     18       264M      0.05          536G          67
查看存储池的pg个数
[root@lab8106 ~]# ceph osd dump|grep pool
pool 6 'rbd' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 8 pgp_num 8 last_change 132 flags hashpspool stripe_width 0
pool 7 'pool1' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 134 flags hashpspool stripe_width 0
pool 8 'pool2' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 136 flags hashpspool stripe_width 0
pool 9 'pool3' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 138 flags hashpspool stripe_width 0
pool 10 'pool4' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 140 flags hashpspool stripe_width 0
pool 11 'pool5' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 142 flags hashpspool stripe_width 0
pool 12 'pool6' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 144 flags hashpspool stripe_width 0
pool 13 'pool7' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 146 flags hashpspool stripe_width 0
pool 14 'pool8' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 148 flags hashpspool stripe_width 0
pool 15 'pool9' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 150 flags hashpspool stripe_width 0
pool 16 'pool10' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 100 pgp_num 100 last_change 152 flags hashpspool stripe_width 0
pool 17 'pool11' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 100 pgp_num 100 last_change 154 flags hashpspool stripe_width 0
pool 18 'pool12' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 200 pgp_num 200 last_change 156 flags hashpspool stripe_width 0
我们看下这个是怎么得到的
pool rbd objects per pg (1912) is more than 50.3158 times cluster average (38)
rbd objects_per_pg = 15296 / 8 = 1912
objects_per_pg = 15920 / 417 ≈ 38
50.3158 = rbd objects_per_pg / objects_per_pg = 1912 / 38
也就是说,其他pool的对象太少,而这个pool的pg少、对象多,就会提示这个警告。我们看下代码里面的判断:
https://github.com/ceph/ceph/blob/master/src/mon/PGMonitor.cc
    int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
    if (average_objects_per_pg > 0 &&
        pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects &&
        p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) {
      int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
      float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
      if (g_conf->mon_pg_warn_max_object_skew > 0 &&
          ratio > g_conf->mon_pg_warn_max_object_skew) {
        ostringstream ss;
        ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ostringstream ss;
          ss << "pool " << name << " objects per pg ("
             << objects_per_pg << ") is more than " << ratio << " times cluster average ("
             << average_objects_per_pg << ")";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
主要有下面的几个限制条件:
mon_pg_warn_min_objects = 10000 //总的对象超过10000
mon_pg_warn_min_pool_objects = 1000 //存储池对象超过1000
mon_pg_warn_max_object_skew = 10 //就是上面的存储池的平均对象与所有pg的平均值的倍数关系
解决问题
有三个方法解决这个警告的提示:
删除无用的存储池
如果集群中有一些不用的存储池,并且相对的pg数目还比较高,那么可以删除一些这样的存储池,从而提高集群平均每个pg的对象数,降低该存储池相对于集群平均值的倍数;当这个倍数不再超过mon_pg_warn_max_object_skew的值时,警告就会没有了
增加提示的pool的pg数目
有可能的情况就是,这个存储池的pg数目从一开始就不够,增加pg和pgp数目,同样可以降低该存储池每个pg的对象数相对于集群平均值的倍数,使其不再超过mon_pg_warn_max_object_skew的值
增加mon_pg_warn_max_object_skew的参数值
如果集群里面已经有足够多的pg了,再增加pg会不稳定,如果想去掉这个警告,就可以增加这个参数值,默认为10
总结
这个警告比较的是存储池中每个pg的平均对象数目与整个集群每个pg的平均对象数目的偏差,如果偏差太大就会发出警告
检查的步骤:
ceph health detail
ceph df
ceph osd dump | grep pool
mon_pg_warn_max_object_skew = 10.0
((objects/pg_num) in the affected pool)/(objects/pg_num in the entire system) > 10.0 警告就会出现
变更记录



|
|