|
|
# ceph health detail
HEALTH_WARN 1 pools have many more objects per pg than average
MANY_OBJECTS_PER_PG 1 pools have many more objects per pg than average
    pool pool-hdd-2 objects per pg (5503) is more than 12.9482 times cluster average (425)

定位问题
[root@lab8106 ~]# ceph -s
    cluster fa7ec1a1-662a-4ba3-b478-7cb570482b62
     health HEALTH_WARN
            pool rbd has many more objects per pg than average (too few pgs?)
     monmap e1: 1 mons at {lab8106=192.168.8.106:6789/0}
            election epoch 30, quorum 0 lab8106
     osdmap e157: 2 osds: 2 up, 2 in
            flags sortbitwise
      pgmap v1023: 417 pgs, 13 pools, 18519 MB data, 15920 objects
            18668 MB used, 538 GB / 556 GB avail
                 417 active+clean
集群出现了这个警告:pool rbd has many more objects per pg than average (too few pgs?)。这个警告在hammer版本里面的提示是 pool rbd has too few pgs。
这个地方查看集群详细信息:
[root@lab8106 ~]# ceph health detail
HEALTH_WARN pool rbd has many more objects per pg than average (too few pgs?); mon.lab8106 low disk space
pool rbd objects per pg (1912) is more than 50.3158 times cluster average (38)
看下集群的pool的对象状态
[root@lab8106 ~]# ceph df
GLOBAL:
    SIZE     AVAIL     RAW USED     %RAW USED
    556G      538G       18668M          3.28
POOLS:
    NAME       ID     USED       %USED     MAX AVAIL     OBJECTS
    rbd        6      16071M      2.82          536G       15296
    pool1      7        204M      0.04          536G          52
    pool2      8        184M      0.03          536G          47
    pool3      9        188M      0.03          536G          48
    pool4      10       192M      0.03          536G          49
    pool5      11       204M      0.04          536G          52
    pool6      12       148M      0.03          536G          38
    pool7      13       184M      0.03          536G          47
    pool8      14       200M      0.04          536G          51
    pool9      15       200M      0.04          536G          51
    pool10     16       248M      0.04          536G          63
    pool11     17       232M      0.04          536G          59
    pool12     18       264M      0.05          536G          67
查看存储池的pg个数
[root@lab8106 ~]# ceph osd dump|grep pool
pool 6 'rbd' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 8 pgp_num 8 last_change 132 flags hashpspool stripe_width 0
pool 7 'pool1' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 134 flags hashpspool stripe_width 0
pool 8 'pool2' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 136 flags hashpspool stripe_width 0
pool 9 'pool3' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 138 flags hashpspool stripe_width 0
pool 10 'pool4' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 140 flags hashpspool stripe_width 0
pool 11 'pool5' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 142 flags hashpspool stripe_width 0
pool 12 'pool6' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 144 flags hashpspool stripe_width 0
pool 13 'pool7' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 146 flags hashpspool stripe_width 0
pool 14 'pool8' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 148 flags hashpspool stripe_width 0
pool 15 'pool9' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 1 pgp_num 1 last_change 150 flags hashpspool stripe_width 0
pool 16 'pool10' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 100 pgp_num 100 last_change 152 flags hashpspool stripe_width 0
pool 17 'pool11' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 100 pgp_num 100 last_change 154 flags hashpspool stripe_width 0
pool 18 'pool12' replicated size 1 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 200 pgp_num 200 last_change 156 flags hashpspool stripe_width 0
我们看下这个是怎么得到的
pool rbd objects per pg (1912) is more than 50.3158 times cluster average (38)
rbd objects_per_pg = 15296 / 8 = 1912
objects_per_pg = 15920 / 417 ≈ 38
50.3158 = rbd objects_per_pg / objects_per_pg = 1912 / 38
也就是出现其他pool的对象太少,而这个pg少,对象多,就会提示这个了,我们看下代码里面的判断
https://github.com/ceph/ceph/blob/master/src/mon/PGMonitor.cc
    int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
    if (average_objects_per_pg > 0 &&
        pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects &&
        p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) {
      int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
      float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
      if (g_conf->mon_pg_warn_max_object_skew > 0 &&
          ratio > g_conf->mon_pg_warn_max_object_skew) {
        ostringstream ss;
        ss << "pool " << name << " has many more objects per pg than average (too few pgs?)";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          ostringstream ss;
          ss << "pool " << name << " objects per pg ("
             << objects_per_pg << ") is more than " << ratio << " times cluster average ("
             << average_objects_per_pg << ")";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
主要下面的几个限制条件
mon_pg_warn_min_objects = 10000 //总的对象超过10000
mon_pg_warn_min_pool_objects = 1000 //存储池对象超过1000
mon_pg_warn_max_object_skew = 10 //就是上面的存储池的平均对象与所有pg的平均值的倍数关系
解决问题
有三个方法解决这个警告的提示:
删除无用的存储池
如果集群中有一些不用的存储池,并且相对的pg数目还比较高,那么可以删除一些这样的存储池,这样集群的平均每pg对象数会升高,该存储池相对于平均值的倍数(即与mon_pg_warn_max_object_skew比较的那个值)就会降低,警告就会没有了
增加提示的pool的pg数目
有可能的情况就是,这个存储池的pg数目从一开始就不够,增加pg和pgp数目,同样可以降低这个倍数
增加mon_pg_warn_max_object_skew的参数值
如果集群里面已经有足够多的pg了,再增加pg会不稳定,如果想去掉这个警告,就可以增加这个参数值,默认为10
总结
这个警告比较的是存储池中每个pg的平均对象数目与整个集群每个pg的平均对象数目的偏差,如果偏差太大就会发出警告
检查的步骤:
ceph health detail
ceph df
ceph osd dump | grep pool
mon_pg_warn_max_object_skew = 10.0
((objects/pg_num) in the affected pool)/(objects/pg_num in the entire system) > 10.0 警告就会出现
变更记录

|
|