找回密码
 注册
查看: 597|回复: 2

1 Large omap objects ceph health detail

[复制链接]

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
发表于 2022-8-19 17:00:37 | 显示全部楼层 |阅读模式
Large omap objects
# ceph health detail
HEALTH_WARN 1 large omap objects
LARGE_OMAP_OBJECTS 1 large omap objects
    1 large objects found in pool 'is_recovery' #出现large omap的pool
    Search the cluster log for 'Large omap object found' for more details.

- b- k' i$ ]8 l2 s  E  ~3 q2 w$ r3 g. y: s


ceph pg ls-by-pool  is_recovery|awk '{print "ceph pg "$1 " query|grep num_large_omap_objects"}'|sh -x
ceph pg 11.0 query|grep num_large_omap_objects
ceph pg 11.1 query|grep num_large_omap_objects
ceph pg 11.2 query|grep num_large_omap_objects
6 P9 _- J( F/ C  B6 P6 `3 j7 y8 @0 ^) s* r, G# y
+ m9 u" }( o  R+ ~+ p
7 n& E4 W' w/ @6 p
) L6 v$ G! U( R2 O; a, k. G
[root@ceph-1 ~]# ceph daemon mds.ceph-1 flush journal
{
    "message": "",
    "return_code": 0
}
[root@ceph-1 ~]#
[root@ceph-2 ~]# ceph daemon mds.ceph-2 flush journal
"mds_not_active"
[root@ceph-2 ~]# ceph daemon mds.ceph-2 flush journal
"mds_not_active"
* ]! w+ S+ D1 w3 D5 z
+ X$ F+ B4 ]* y0 H9 w5 j: u' Q' j
. w; f% v$ U; b: L2 O
7 |' l8 t: H# s( @0 Z
' K/ s" q. u% @/ S, Q3 H. B) ]9 x, R) p. O7 K+ N

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
 楼主| 发表于 2022-8-23 09:53:54 | 显示全部楼层
index pool的 large omap 处理
向单个bucket压测2000W个object,默认设置shard数为16,压测到1800W出现large omap,介绍一下错误定位和如何处理。

异常定位
集群状态如下
& m/ F" {9 a* v3 W
[root@demo123 cephuser]# ceph health detail
HEALTH_WARN 16 large omap objects
LARGE_OMAP_OBJECTS 16 large omap objects
    16 large objects found in pool 'cn-bj-test2.rgw.buckets.index'
    Search the cluster log for 'Large omap object found' for more details.
复制
通过脚本找到对应的pg信息,脚本请查看之前一篇omap large处理的文章。

" O8 f) L- t" V[root@demo123 cephuser]# python large_omap.py' h% P  O& a! @3 A0 p" y7 I; H- i
Large omap objects poolname = cn-bj-test2.rgw.buckets.index" N! ?* o/ B- \, G# ~/ T! Q
pgid=13.1f OSDs=[78, 9, 59] num_large_omap_objects=1, q4 e, i+ R8 ~, @2 X+ i& ~! E6 T  G4 I
pgid=13.33 OSDs=[59, 79, 19] num_large_omap_objects=1
" n6 U, ]+ `1 h" Ipgid=13.3c OSDs=[49, 29, 78] num_large_omap_objects=1- g; e* [+ o  C  [- y9 C+ `' V
pgid=13.3d OSDs=[48, 69, 9] num_large_omap_objects=1
8 `: }( u7 E% V: g3 @pgid=13.45 OSDs=[88, 39, 28] num_large_omap_objects=1( i) x+ i3 D8 x/ g' s% C6 A- ^
pgid=13.4d OSDs=[38, 29, 89] num_large_omap_objects=10 b! M: x5 W& U$ P) H, m
pgid=13.50 OSDs=[68, 19, 59] num_large_omap_objects=15 O- x7 U9 p# O6 Z# A/ i; S  q6 s
pgid=13.6b OSDs=[39, 79, 8] num_large_omap_objects=1! s: y9 k' V* v: m
pgid=13.8e OSDs=[38, 9, 78] num_large_omap_objects=1
1 j0 U- Q. Z& z: j% opgid=13.d1 OSDs=[9, 88, 38] num_large_omap_objects=15 b9 `. g# ^9 f3 @
pgid=13.d2 OSDs=[59, 88, 28] num_large_omap_objects=15 r5 }4 ?  U2 s
pgid=13.e1 OSDs=[19, 88, 49] num_large_omap_objects=1
; O" S! R* Y# @0 B5 c1 gpgid=13.e4 OSDs=[38, 19, 89] num_large_omap_objects=1
. c! [4 a, J4 T5 zpgid=13.e7 OSDs=[19, 89, 38] num_large_omap_objects=1
9 ?. R% H; y3 t" }+ y7 xpgid=13.ec OSDs=[89, 28, 48] num_large_omap_objects=1
; _7 T( J8 m' mpgid=13.f5 OSDs=[38, 88, 19] num_large_omap_objects=1
+ a7 b/ g$ L8 F# I  z复制% M- p$ L9 G, Q4 \1 [2 X* C/ b0 g
查找OSD日志,确定object名称(".dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11"),发现omap条目数达到了2378492,超过默认告警值。

! k3 R) T9 w, Z% P1 |5 X" M$ h[root@demo123 cephuser]# zcat /var/log/ceph/ceph-osd.19.log-20181231.gz |grep "omap"
6 I  b  E+ [$ C% e2018-12-30 23:00:42.334766 7f6583f44700  0 log_channel(cluster) log [WRN] : Large omap object found. Object: 13:87443b2d:::.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11:head Key count: 2378492 Size (bytes): 491722758
' [" e3 V0 J* ]- {/ g, z: w6 P复制
默认告警值为2000000,2378492>2000000,不建议去修改这个默认值,因为改得过大会加大集群出现异常的风险,属于掩耳盗铃。
; U3 o/ @; i( ^! v( }3 i% f4 D. k  v" |0 v6 ]
[root@demo123 cephuser]# ceph daemon /var/run/ceph/ceph-osd.19.asok config show |grep large
! ~. F2 L8 w$ D2 x! N, ]    "osd_bench_large_size_max_throughput": "104857600",
9 P; C; c5 U7 [( [* r: F    "osd_deep_scrub_large_omap_object_key_threshold": "2000000",; @) F! j4 @- [( e  r" _: ~
    "osd_deep_scrub_large_omap_object_value_sum_threshold": "1073741824"," q; L+ c" Q5 O5 J+ r7 |# g
复制
查看一下发生omap过大的bucket,确定相关信息

& C' F5 P$ d) P/ q& K, f5 W[root@demo123 cephuser]# radosgw-admin bucket stats --bucket=demo1/ e. k8 K1 o1 |8 _$ U# h
{
  S. v. S$ ?' G  f( s    "bucket": "demo1",5 Z  Q: c& M+ d$ ^
    "zonegroup": "68f1dcf5-0470-4a48-8cd2-51c837a2cafb",0 q" z5 a- a* ^8 ~/ d% T* h
    "placement_rule": "default-placement",. c! w- n7 T4 l7 X3 _
    "explicit_placement": {; I: s3 [! B2 E; A% x3 L% _
        "data_pool": "",8 ~# V1 W1 G2 O6 N
        "data_extra_pool": "",
# a3 ?# {+ d& z* L* E& X        "index_pool": "", Y1 M% e; o  L( ]! U$ l1 {' j& M
    },4 V9 L) j  u  p: i% X
    "id": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1", #当前bucket instance ID,  G# N3 E: V4 H% ]
    "marker": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1",; o$ w6 _4 E2 L7 O5 i4 [
    "index_type": "Normal",/ M+ P8 T9 `* i7 X# t) ]) e
    "owner": "s3test",
1 c! n2 s; D0 u( f. v2 {9 n9 M    "ver": "0#2638037,1#2637965,2#2632835,3#2632869,4#2632799,5#2632597,6#2633289,7#2633175,8#2637227,9#2637609,10#2637997,11#2632455,12#2631337,13#2631624,14#2631983,15#2632359",
- u+ {/ h% x: I4 j    "master_ver": "0#0,1#0,2#0,3#0,4#0,5#0,6#0,7#0,8#0,9#0,10#0,11#0,12#0,13#0,14#0,15#0", #16个shard
  e% Y; \5 i& _( H    "mtime": "2018-11-28 16:47:45.560039",5 k0 O3 C9 t) s6 Y" k* X2 C
    "max_marker": "0#00002638036.2638608.5,1#00002637964.2638536.5,2#00002632834.2649479.5,3#00002632868.2633634.5,4#00002632798.2633370.5,5#00002632596.2633168.5,6#00002633288.2633860.5,7#00002633174.2633747.5,8#00002637226.2637798.5,9#00002637608.2638181.5,10#00002637996.2638569.5,11#00002632454.2633026.5,12#00002631336.2631914.5,13#00002631623.2632195.5,14#00002631982.2632554.5,15#00002632358.2632930.5",+ L+ J' I" ?; \- \) w1 f+ Y
    "usage": {6 _" G9 n$ D. w3 Z
        "rgw.main": {' w1 u/ ~0 B8 e6 r4 _% O& z
            "size": 1975757355553,
' [/ A7 l( \, w$ ~            "size_actual": 2047893610496,
" d" c0 s0 A' z9 `            "size_utilized": 1975757355553,2 D/ }1 U5 S; v; M. m
            "size_kb": 1929450543,
' x& j0 `: J' I/ ]" R( a( R5 I            "size_kb_actual": 1999896104,- s' x. U7 D$ H3 `7 R' q
            "size_kb_utilized": 1929450543,
5 J. u$ q/ O& L* _8 A            "num_objects": 19998962 #近2000Wobject" J! o: }/ o: q  v5 @
        }* b1 Z! t: [# B
    },
6 |: d. X% b2 u    "bucket_quota": {
; S# L5 C) V( S5 l8 X6 J        "enabled": false,2 B) K0 z1 A2 t7 y) }/ Q. {2 t$ Z! i
        "check_on_raw": false,! r6 t  i5 U( t+ f, Y
        "max_size": -1,' \- `6 U. a) B: A% n, M- O
        "max_size_kb": 0,9 H4 S# u3 O1 ^5 G+ R
        "max_objects": -1
) N) v" ]) f5 C5 w' [    }
! D3 I! x' b8 [}
4 m+ m& E2 m9 G9 J  {2 k  J! w% t& I$ J复制/ J/ @1 k7 C7 g9 M4 u$ @
异常处理
通过bucket reshard操作,将原来的bucket 重新划分shard,shard数量从16->64。注意reshard有风险,最好停掉客户端的读写操作以后再进行,同时如果你使用了multisite,请根据官方说明立即关闭Dynamic resharding特性。

Dynamic resharding 说明: http://docs.ceph.com/docs/mimic/radosgw/dynamicresharding/

做完reshard需要手工删除之前的索引数据,工具也提示了下面的内容。
" f+ g. p5 Y. {. ^$ n" F6 ~7 |* _6 k8 v2 g4 M/ T
[root@demo123 cephuser]# radosgw-admin bucket reshard --bucket demo1 --num-shards 64
+ ]0 g( C/ A4 y4 }0 C) e, m$ _*** NOTICE: operation will not remove old bucket index objects ***1 [8 S0 B: S) C# L8 k7 I
***         these will need to be removed manually             ***) r$ m) q5 D# w7 c
tenant:- e( c! _  S. X6 y
bucket name: demo1
+ S, o7 Z4 ^" ^/ }  S$ Dold bucket instance id: afd874cd-f976-4007-a77c-be6fca298b71.34209.1  ^- Q& K* u0 W/ r( T
new bucket instance id: afd874cd-f976-4007-a77c-be6fca298b71.45786.1
total entries: 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 19998962
2019-01-03 11:42:33.741314 7f74d15c6dc0  0 WARNING: RGWReshard::add failed to drop lock on demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 ret=-2
- r2 C5 l( u1 h/ l7 h7 e复制
' {' b/ m' {' \  f. H8 f检查reshard结果& y/ ?7 o6 a) F7 y2 d
4 |! Q" {$ X( E0 d
[root@demo123 cephuser]# radosgw-admin bucket stats --bucket=demo15 T7 ?# p* m. }# F
{6 [& |* t3 L6 \. {7 d! X( c& U2 t
    "bucket": "demo1",
6 b5 {, B" G4 |5 A6 s) @0 L    "zonegroup": "68f1dcf5-0470-4a48-8cd2-51c837a2cafb",' |  b1 m* a3 }0 K- o; m. k& j
    "placement_rule": "default-placement",
( e: ^( c: l: j" g5 `    "explicit_placement": {) j7 [- L1 }. V
        "data_pool": "",/ y9 q( N3 R+ o' U8 W
        "data_extra_pool": "",6 l; D" X+ [7 B$ P! _0 D* a6 P
        "index_pool": ""+ c: z/ G! I& w. v7 K
    },( Z3 i$ G( M' E3 M9 O) J
    "id": "afd874cd-f976-4007-a77c-be6fca298b71.45786.1", #bucket instance ID发生变化# Q; v' v! ~% d" `% k
    "marker": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1",
& `* K/ \9 w' X8 q, \1 b9 E/ _    "index_type": "Normal",  r# e$ `- o. @) D" A
    "owner": "s3test",4 ]2 c" s2 m7 R
    "ver": "0#4920,1#4920,2#4883,3#4877,4#4882,5#4883,6#4885,7#4880,8#4882,9#4880,10#4878,11#4883,12#4923,13#4883,14#4882,15#4874,16#4878,17#4880,18#4884,19#4881,20#4882,21#4881,22#4876,23#4922,24#4883,25#4887,26#4881,27#4879,28#4879,29#4879,30#4882,31#4884,32#4880,33#4879,34#4917,35#4876,36#4883,37#4885,38#4884,39#4879,40#4883,41#4880,42#4880,43#4882,44#4884,45#4877,46#4879,47#4877,48#4881,49#4880,50#4881,51#4881,52#4883,53#4876,54#4880,55#4884,56#4881,57#4885,58#4882,59#4881,60#4881,61#4881,62#4883,63#4882",#shard 数量变为了649 `3 V/ U: ?( \7 t4 |, O) h
    "master_ver": "0#0,1#0,2#0,3#0,4#0,5#0,6#0,7#0,8#0,9#0,10#0,11#0,12#0,13#0,14#0,15#0,16#0,17#0,18#0,19#0,20#0,21#0,22#0,23#0,24#0,25#0,26#0,27#0,28#0,29#0,30#0,31#0,32#0,33#0,34#0,35#0,36#0,37#0,38#0,39#0,40#0,41#0,42#0,43#0,44#0,45#0,46#0,47#0,48#0,49#0,50#0,51#0,52#0,53#0,54#0,55#0,56#0,57#0,58#0,59#0,60#0,61#0,62#0,63#0",% `+ {, Z* @& ^8 m
    "mtime": "2019-01-03 11:32:50.349905",
6 r0 ~$ \+ L4 d/ |    "max_marker": "0#,1#,2#,3#,4#,5#,6#,7#,8#,9#,10#,11#,12#,13#,14#,15#,16#,17#,18#,19#,20#,21#,22#,23#,24#,25#,26#,27#,28#,29#,30#,31#,32#,33#,34#,35#,36#,37#,38#,39#,40#,41#,42#,43#,44#,45#,46#,47#,48#,49#,50#,51#,52#,53#,54#,55#,56#,57#,58#,59#,60#,61#,62#,63#",
* W. B3 C/ o8 f9 w) K5 W    "usage": {
( a; y; i' x7 J# [        "rgw.main": {
- M# l  U3 b* x5 y; u* n            "size": 1975757355553,) l' D7 u: f) P8 o" p3 q  E
            "size_actual": 2047893610496,
5 A% L& `# Y* {/ G2 I            "size_utilized": 1975757355553,
' C  L+ E: _2 ~) @! v1 J+ g            "size_kb": 1929450543,) w3 W5 u0 Q. c
            "size_kb_actual": 1999896104,0 T4 T" Y* A, Q" l
            "size_kb_utilized": 1929450543,+ I7 s" \# `: o( Q, Q, t0 ]
            "num_objects": 19998962
" m5 |/ i/ |" l0 W  _        }- x1 }9 i& H3 U8 O
    },
# B. N4 x5 V$ q* f0 P    "bucket_quota": {5 ?2 j$ C: p* R' c* y
        "enabled": false,! U/ O/ N# M1 f
        "check_on_raw": false,* O) V2 p! T; d' |6 x2 L9 }
        "max_size": -1,$ F& r5 b# \1 X' t4 C$ i, c# o/ X
        "max_size_kb": 0,
4 i5 i; s8 {6 {        "max_objects": -1
/ u- h' o4 l/ [% {2 B    }0 T% I5 G0 l! v
}
" H& S. w! u/ C, _" i复制$ V- J% G3 V1 k
回收旧数据
根据之前工具的提示需要回收index和meta两个pool里面的残留数据

回收index pool数据
! Z. M. Y) s* P( W" o5 I4 i4 q0 H' C+ `5 E2 E
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.buckets.index|grep "afd874cd-f976-4007-a77c-be6fca298b71.34209.1"4 J9 g4 J) }$ P' K' o
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.5+ Y. S; C) `3 ^
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.15% K: V3 e7 K; m( t9 i
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.2
% S( a- p* A7 w3 ]( H: @! P.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.1* t+ d$ [* J3 k3 c4 t0 m3 ?
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.0
& h$ b6 Q$ ~, q- [, s0 \.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.4
- H5 L7 g0 O$ l+ L8 Y  O2 K.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11
0 T- v0 L6 E6 e5 A6 g: z/ i4 p.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.139 t# n9 R& s- @4 m' x0 A+ o2 h
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.6
3 M* }* }* c9 g5 u' i% i2 J8 B.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.3
2 Q1 p: S2 Y' ~; {; S.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.7
+ n5 \+ O% q$ Q. m4 x/ x" N.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.98 z- p6 J# k  H1 G) w
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.14- e+ y/ b8 p* ]% o/ Z* }& q
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.10& O; M5 U' ~1 p: j
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.12
4 t7 H3 U: s0 i' f; T. _0 d* G.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.80 m9 u( o* F' Y" L/ D" G0 B  G
复制' |' F4 Z; U2 P4 `
使用rados rm命令删除数据

; m. ~3 R6 L9 Y; v! ^- m[root@demo123 supdev]# rados ls -p cn-bj-test2.rgw.buckets.index|grep "afd874cd-f976-4007-a77c-be6fca298b71.34209.1"|awk '{print "rados rm -p cn-bj-test2.rgw.buckets.index "$1}'|sh -x/ g7 z/ ^8 v. t' K8 H5 c8 k" v
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.56 t  \7 n0 U5 h" z- o+ {; R" ]
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.15
/ a  p2 c1 B1 x; J) S+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.2
$ x2 W! Z4 I+ N1 l4 j8 t+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.1
! e. Q! N% q, A3 D% a" T# ]; ]+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.0
% M& Y$ M/ p9 U, |4 C  Z+ i+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.4
; P9 d3 W4 x0 N" e  ^+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11
8 E% ~) m  ]8 A  b* g( t+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.13
0 m$ |" |- @3 c; Y6 k( k$ s0 X+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.6/ U. s9 m# u" C/ E" H; C# O9 p4 v  s
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.35 i$ O* n0 I. l+ C
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.7
% n# P7 O2 y8 n& K& ?+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.9: @( ~% r0 \( d  U5 E) L$ {1 V& p
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.14
1 r: M$ a. W/ y3 _$ u+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.104 [: j' L3 C3 g
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.123 [. t. v- m0 p
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.8
% m2 \/ ]4 j% W# W' {; v- p复制9 |- l% |0 [, X  k7 r
回收meta pool的数据
7 h$ Z" r+ u' Q) l: ^* d- h
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.meta --all( `0 V5 x- @, X+ p  z! _
root    demo1
) _' D3 K8 |& i- c+ v2 U/ z# eroot    .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.45786.1# @: W8 X. _, \9 q/ p9 o% H# y( {
root    .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 #残留' ~' f) c+ O) q% r" h  N3 I
root    my-new-container_segments
: d1 R) W- p. V3 kroot    .bucket.meta.demo2:afd874cd-f976-4007-a77c-be6fca298b71.34353.18 a4 h! F6 {" F! n
root    .bucket.meta.my-new-container:afd874cd-f976-4007-a77c-be6fca298b71.7991.1: R2 @" N$ e( Q/ [* _
users.uid    s3test.buckets/ q5 L& H/ O3 J& ~" T
users.uid    swiftuser/ f) A8 S( n2 N) A8 x. `6 p; }
users.swift    swiftuser:swiftuser1
+ e" R) \2 w1 M: p: qusers.keys    SNACA4LX9DS21NGMSRX4
0 n" c3 h* I6 mroot    .bucket.meta.my-new-container_segments:afd874cd-f976-4007-a77c-be6fca298b71.7991.4
/ g9 C0 h; m5 t' g; ]+ X: D0 w5 n2 Yusers.uid    s3test
3 R% Z+ R, t, ^root    demo2
7 {3 c1 l* A0 V( ^/ Iusers.keys    XP8E2452AB6EBU3RPD0C
* M' o6 {- L5 {$ I# g  O* w* @root    my-new-container0 r/ Z; V9 T- {6 q& d; k& {
users.uid    swiftuser.buckets
- J, }( v% M5 Q7 ousers.uid    synchronization-user
$ p! o& n9 l" Q( W复制
注意这里用的是ceph L(Luminous)版本,使用了namespace,所以要指定namespace才能删除

& r$ e) X- o0 y) t[root@demo123 cephuser]# rados rm  -p cn-bj-test2.rgw.meta .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 --namespace=root8 A% `9 Z1 g- c' I8 @+ r8 T) s
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.meta --all" o+ [0 z8 ^' X3 t
root    demo1
: M0 c) }( B/ u' \root    .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.45786.14 x1 V7 A. i. L" ^( B& {+ g
root    my-new-container_segments$ j, z1 V- ~- ]; t' F- x
root    .bucket.meta.demo2:afd874cd-f976-4007-a77c-be6fca298b71.34353.1
7 V2 [1 a8 r; {; e' A+ `' iroot    .bucket.meta.my-new-container:afd874cd-f976-4007-a77c-be6fca298b71.7991.1
. Y) w9 r( B: ^1 Y# w  zusers.uid    s3test.buckets
$ T; i/ @; I, g/ K5 D3 rusers.uid    swiftuser/ `8 r+ J# N6 w/ K! |0 c- i
users.swift    swiftuser:swiftuser19 \/ w$ V# B) c, M& ]4 A
users.keys    SNACA4LX9DS21NGMSRX4* Y( B3 O' D6 ?  M' Q! w# }
root    .bucket.meta.my-new-container_segments:afd874cd-f976-4007-a77c-be6fca298b71.7991.4& M+ \+ c& F4 H% k, D
users.uid    s3test
" P" i# Z* f, z. ?3 o! a, D1 nroot    demo2
. u+ V( D2 O) d* {1 Busers.keys    XP8E2452AB6EBU3RPD0C
- H- E" d& e! M4 ]2 M4 |! Troot    my-new-container3 m& K% h' @1 B* T# w; [
users.uid    swiftuser.buckets
- e# l/ u# H7 k/ _users.uid    synchronization-user
) o: k" [4 N1 V0 e  ?& C复制3 Q5 O  I/ q0 [6 s7 _
清除large omap告警
删完了object并不会立即消除告警,需要手工对相应的pg进行deep-scrub操作,具体如下
! W0 P) B/ ~! _/ z1 I2 p+ `# x  z: M0 k+ @; ~' e
[root@demo123 cephuser]# python large_omap.py
) P5 u* u! ]4 [3 R3 `4 aLarge omap objects poolname = cn-bj-test2.rgw.buckets.index
: g* z6 n1 ~2 d* l% p( `pgid=13.33 OSDs=[59, 79, 19] num_large_omap_objects=16 x7 t/ E  ^5 u' F7 l* \
pgid=13.3c OSDs=[49, 29, 78] num_large_omap_objects=1
, i/ p2 T( Y4 F+ B2 I1 Spgid=13.3d OSDs=[48, 69, 9] num_large_omap_objects=1( ]; @0 b/ R  j8 g! n& D$ I
pgid=13.45 OSDs=[88, 39, 28] num_large_omap_objects=1
/ p! \, w6 H" q$ ]! d/ Y' ]pgid=13.4d OSDs=[38, 29, 89] num_large_omap_objects=1
1 z$ D0 n9 O! R$ l: b9 E- qpgid=13.50 OSDs=[68, 19, 59] num_large_omap_objects=1
0 s2 P$ |% n6 p+ Epgid=13.6b OSDs=[39, 79, 8] num_large_omap_objects=1+ |' G6 f- I& z5 L8 Z, r, ]
pgid=13.8e OSDs=[38, 9, 78] num_large_omap_objects=1
" \/ B. _: r: O/ q" Z4 Tpgid=13.d1 OSDs=[9, 88, 38] num_large_omap_objects=12 u% s2 i* v1 i
pgid=13.d2 OSDs=[59, 88, 28] num_large_omap_objects=1. [! `' [" l7 L
pgid=13.e1 OSDs=[19, 88, 49] num_large_omap_objects=13 n+ S1 O& y$ H( o7 f% v3 V
pgid=13.e4 OSDs=[38, 19, 89] num_large_omap_objects=1
( n7 X. |2 [. C' i2 l' y9 a! Spgid=13.e7 OSDs=[19, 89, 38] num_large_omap_objects=1! j( a# f# a# y0 @* H3 N
pgid=13.ec OSDs=[89, 28, 48] num_large_omap_objects=1
( A' i* B; Z$ c* {4 H" k0 Opgid=13.f5 OSDs=[38, 88, 19] num_large_omap_objects=1
. {# K1 ~5 q1 e% w3 d  N[root@demo123 cephuser]# ceph pg deep-scrub 13.33
- ?) h) A/ ]: D1 q$ ninstructing pg 13.33 on osd.59 to deep-scrub
5 R# ]7 U0 f* T, Q: Z. h4 D1 k复制
操作完可以看到有pg进行deep-scrub,之后状态恢复
6 E! j8 f% x) o( t
' `9 E6 A7 [( s( H' P$ U. Z* o+ b[root@demo123 cephuser]# ceph -s
, [2 y" H! N  z; o  cluster:4 p# q) ?. Z, e: i
    id:     21cc0dcd-06f3-4d5d-82c2-dbd411ef0ed9& U2 |" ?- J1 r$ W$ N7 h
    health: HEALTH_WARN5 ^! r% p9 x& ~) R; J1 h
            16 large omap objects% E- O' j2 p  X" _9 S9 z. L4 s

, e5 O5 b8 Z8 h  services:
0 Y- L* h7 @% x) C# I1 E# ^    mon: 3 daemons, quorum demo122,demo131,demo141
7 Y8 T7 R9 n2 o, h: c2 p    mgr: demo141(active)
; ^; ~, q+ \/ G! z2 X5 _# V' g    osd: 90 osds: 90 up, 90 in
7 x( _8 v& u0 ^/ X% j    rgw: 1 daemon active
# @; b* k9 G/ o6 H% i# @4 o  T. u
' ]9 w; v- l3 f7 e1 |$ V3 `$ T7 |" H  \  data:
% _0 o* H. [/ ~* a9 V    pools:   7 pools, 3712 pgs8 f1 m+ u! V: P1 R- f. z- [
    objects: 20.13M objects, 1.80TiB# d# n0 V4 D7 c1 H$ D9 s
    usage:   7.28TiB used, 408TiB / 415TiB avail0 E& G4 ]# `: [1 z/ U
    pgs:     3711 active+clean, B& i0 K; h3 q/ {+ I/ A
             1    active+clean+scrubbing+deep #开始deep scrub* T* L2 B0 e% P

& _$ R8 p2 Z; Q  io:
! T, r8 P& T. M  V    client:   5.29MiB/s rd, 935B/s wr, 69op/s rd, 28op/s wr
2 k; V9 _" R* |% I% u2 O6 O6 o& D0 P  P, }
[root@demo123 cephuser]# ceph -s
9 q! {/ i0 r& ]% ^2 b9 l  cluster:
. P- ^. c: ^+ o' K. n    id:     21cc0dcd-06f3-4d5d-82c2-dbd411ef0ed9; L) W) `5 k2 P) W! e4 A! O1 c
    health: HEALTH_WARN
/ _, e3 p: S( N  n            15 large omap objects #减少了1个9 u' O' C7 r' g: I) B$ R
1 n$ X, w7 n. f+ \# I
  services:
1 C% v" q# g* r% W    mon: 3 daemons, quorum demo122,demo131,demo141. l1 }7 C; d9 F/ L% m
    mgr: demo141(active)
9 I. e% g) ]; C) |4 Q3 B2 Z& N6 i    osd: 90 osds: 90 up, 90 in" A0 F" T0 v& R# ]- D
    rgw: 1 daemon active' U" J0 W6 ~5 t! _' h- G6 c

4 j0 [1 M- M$ x- g( B; j  data:
0 r1 f9 J$ M2 \4 v1 ]: R    pools:   7 pools, 3712 pgs" m, c* H, O& q
    objects: 20.13M objects, 1.80TiB
" r! N* y- r, }. S5 b4 L" B    usage:   7.28TiB used, 408TiB / 415TiB avail
( w3 Z7 B6 o: `4 B    pgs:     3712 active+clean" G" i! h' d2 j5 W' J5 x

7 O/ q0 N1 i) ^9 i( i; `) ^  io:
$ ]) d& P. G+ Y' `$ e7 B- @* P    client:   5.33MiB/s rd, 680B/s wr, 36op/s rd, 6op/s wr
0 N! `5 ?/ \2 b9 h复制
总结
index pool的omap告警一般就分为两类:

一类是object条目数过多,导致对应的index 元数据条目数过多,可以用上面的方法处理。
另外一类是bilog过多,这里的方法就不适用了,需要手工进行bilog清理,关于bilog后续会有详细章节介绍。
$ E9 ~% Z+ U( q" Y% ~) n3 Q. I
8 L/ G7 i9 {. _! ^0 W

1

主题

0

回帖

12

积分

管理员

积分
12
QQ
 楼主| 发表于 2022-8-23 09:54:43 | 显示全部楼层
线上multisite环境出现HEALTH_WARN 32 large omap objects,由于已经设置bucket auto reshard=false,所以排除是bucket index所在的shard omap过大引发的问题。官方给出的告警信息无法定位到具体的object,于是有了下面的排错过程。
% A, Q" ]6 L: r* f1 H* |- B
( [2 Q& o% [$ r; ]3 n$ i排查过程2 \7 {/ }' O1 N
[root@demo supdev]# ceph health detail
# `* ]4 O7 K5 uHEALTH_WARN 32 large omap objects
; R$ x. B' W: s; G6 XLARGE_OMAP_OBJECTS 32 large omap objects
/ u% F0 o' P) u! y    32 large objects found in pool 'cn-bj-test1.rgw.log' #出现large omap的pool
2 }5 ?' i" K( m    Search the cluster log for 'Large omap object found' for more details.7 @* V, `2 T$ m- `. @3 o" d* d

& y* e" i+ p/ h/ U; E
3 K; N$ z* [& O3 f" L2 c  l! s3 Q[root@demo supdev]# ceph pg ls-by-pool cn-bj-test1.rgw.log |awk '{print "ceph pg "$1 " query|grep num_large_omap_objects"}'|sh -x$ h1 i# |5 V7 t6 Q: X7 _# y; S) o( z1 Q
ceph pg 11.0 query|grep num_large_omap_objects
/ v7 |/ W' n: w, A: ~ceph pg 11.1 query|grep num_large_omap_objects
6 Y) K5 u* K# e% zceph pg 11.2 query|grep num_large_omap_objects$ b" c) k" o2 r
......
! r& a- \1 j) I6 o& H( V+ ceph pg 11.1e6 query2 P+ E2 ^1 {3 _3 L$ J1 U
+ grep num_large_omap_objects
                "num_large_omap_objects": 1 #有large omap的object数量
& U, I( Q: ~1 f. c" J+ y! U                    "num_large_omap_objects": 0
! u  n* b& c3 |' l! a                    "num_large_omap_objects": 05 _6 _0 {- i' f( i

# Q0 D' Z- {8 c- P1 K
, q, x0 }* M7 C  j6 u[root@demo supdev]# ceph pg 11.1e6 query #查询pg详细信息$ U7 [- D9 [( a* g5 g/ ]6 X+ u
{8 m9 `- I, f& E+ C+ \' O; @6 g/ `) t
    "state": "active+clean",1 s+ J7 ^* W. W+ M
.....
# n* `+ H; T; \    "info": {8 H! X3 u6 G9 f/ W$ M8 r# x/ q5 @  V& g' D
        "pgid": "11.1e6",  w0 v, T, D$ n2 u
        "last_update": "10075'3051746",: g9 ^7 |: u/ c
        "last_complete": "10075'3051746",
) v1 D, [* h6 d3 d# W" ~: g4 c" ?, e        "log_tail": "10075'3050200",+ i2 x1 r0 J6 e: B* r
        "last_user_version": 3051746,! C. M5 x1 J4 |7 e
        "last_backfill": "MAX",
- A8 J" f/ E8 ~        "last_backfill_bitwise": 0,
& q! G; _6 w7 Z7 n9 Z) G: |/ P) X( y        "purged_snaps": [],
& |3 T; H& i; \, S.....
- Z  P7 q5 C; D' H* J2 @# k+ Z  z+ ~  [  o: v! u. B4 o8 b' H" p
              "acting": [% S: g; D) d  f+ F, P0 |
                    46, #主OSD id=467 O0 A* B+ {4 c$ w" b
                    63, #从OSD
4 G( ~- T; X' g                    23  #从OSD
% ]$ f! O& F" X9 Y' F& G                ],, C9 _- [  k+ D
            "stat_sum": {
" X, E! ^+ D% T1 T9 J# x* {! V                "num_bytes": 40,
; D) o! B( v* @1 g- Y- G                "num_objects": 2,6 u1 Z1 d8 i& Y! p
                "num_object_clones": 0,
/ ~5 l: _; z$ }# @                "num_object_copies": 6,$ L% y/ a/ Z1 p
                "num_objects_missing_on_primary": 0,
3 [) ]4 N" ^  k& v2 I4 q: Y                "num_objects_missing": 0,
) f( B/ u8 X! h9 `1 y" H                "num_objects_degraded": 0,
) g' I0 h4 [8 ]5 n                "num_objects_misplaced": 0,
; Q2 ^5 H/ G3 X8 E  l                "num_objects_unfound": 0,+ l# }& D7 y5 I- K( s/ D3 ]
                "num_objects_dirty": 2,4 |% S) e( u, Y5 \' t  N" K- b" C
                "num_whiteouts": 0,
( I5 D+ V. e' C" O  [                "num_read": 3055759,: q' }$ M; ?9 H: `$ l
                "num_read_kb": 3056162,
: i& J& O; `7 w  K1 X                "num_write": 5986011,8 A# c4 Z5 N+ j* }- A' B# u& ?
                "num_write_kb": 53,
4 B  [, |# |# B  L) ?                "num_scrub_errors": 0,8 R7 G4 e3 E: Z* ]2 p! j( X+ E3 Z
                "num_shallow_scrub_errors": 0,
; S) w* {! u7 D                "num_deep_scrub_errors": 0,! k# ?5 i$ O- b$ ~! D
                "num_objects_recovered": 0,
9 f0 }' j5 Y- M$ F2 g  Z9 ~! v                "num_bytes_recovered": 0,
; c6 Z8 V2 s) t" ]* h# y                "num_keys_recovered": 0,# y, O7 B$ z, H! f3 T# z; f
                "num_objects_omap": 1,
; m0 b/ D; d4 t" ^8 g                "num_objects_hit_set_archive": 0,
7 x/ Z& j4 k. l- W* a' U. T                "num_bytes_hit_set_archive": 0,
; M' q% |* k# u                "num_flush": 0,
  S) L1 ?2 d# y                "num_flush_kb": 0,
" J  \: i! S, z/ ?. m                "num_evict": 0,
8 u* ~1 a; ^3 H; z8 b                "num_evict_kb": 0,
/ [* s4 l' A8 c0 m- P' N- v                "num_promote": 0,
# _/ w: {2 }  `  y+ k* X                "num_flush_mode_high": 0,
0 S0 o5 w& y! J0 D$ p9 z6 C  ?: [5 O                "num_flush_mode_low": 0,
+ ?3 o7 x9 }% J8 N. S' J                "num_evict_mode_some": 0,
. R' @. U& e& O" a                "num_evict_mode_full": 0,
+ x, N) [" j+ H* R- ?: r, \4 D4 ], m                "num_objects_pinned": 0,
6 ?/ ^8 b1 F8 {8 ?7 p+ t# `- w                "num_legacy_snapsets": 0,
- n% s/ e5 U# ^: q                "num_large_omap_objects": 1 #large omap的object数量
' D) J% O" K! z8 B            },
) q7 Z# ~$ Z) X+ U) ]5 _; N* L( H            ...
1 L3 {2 K7 K6 z' z                "agent_state": {}
7 w$ Y9 i) s0 Q# h# V}" ?" t4 h% Q8 E$ T' n. H! }! @  `
" a5 h- `9 K# T/ Y

' l4 W* }) g: M/ Y[root@demo supdev]# ceph osd find 46 #根据OSD id找到对应的主机信息
- \2 R- X' }" g# m4 G{
+ x2 G! x$ \) q2 q) H- d    "osd": 46,: S$ F) x9 N$ t# p  f2 ^& ?
    "ip": "100.1.1.40:6812/3691515",
, O8 g6 d! M/ i1 P. U    "crush_location": {1 [; I9 s7 A; F
        "host": "TX-100-1-40-sata",3 X- J# e: J- Y
        "media": "site1-rack2-sata",, w# K5 _, V' t5 b+ o
        "mediagroup": "site1-sata",
* `5 L; k/ Q* }7 ]% F/ r' t0 L8 u+ e        "root": "default"* c- S0 U) P8 V, t5 Y
    }
+ Y0 F- a1 f( ^/ |2 e}5 B& S5 `, I% v
- q  l/ p/ n  j
( U% o" l# v5 f/ K. J/ z
[root@demo supdev]# zcat /var/log/ceph/ceph-osd.46.log-20181210.gz |grep omap #根据OSD日志找到具体的object名称
1 _1 s1 w+ ~( ^$ S! c8 a2018-12-09 23:03:18.803799 7f90e9b46700  0 log_channel(cluster) log [WRN] : Large omap object found. Object: 11:67885262:::sync.error-log.3:head Key count: 2934286 Size (bytes): 657040594 : d) G) s' `- d6 @# ^. Q6 p  i
#OSD 46上的object名称为sync.error-log.3的omap超出标准
" a# ]6 x4 r$ d" s# h) P! ^  E- r& a5 s' g" R' t$ ?5 W2 o. b
" N5 R' J, _  o* ?; x

7 K1 E6 ^: O9 n[root@demo supdev]# rados ls -p cn-bj-test1.rgw.log|grep "sync.error-log.3$" #确定objects存在
- T9 ~, [( S! M1 Y" V4 psync.error-log.3' \5 g  s4 I$ c

#注意整个multisite的同步过程中的错误日志信息以omap形式存储在sync.error-log.*
#吐槽一下,错误日志分32个shard存储,代码写死了,而且错误日志目前还只能通过手工清理,无法像其他日志一样自动trim,随着错误日志不断堆积,才引发了今天的问题。

[root@demo supdev]# radosgw-admin sync error list|more #查看错误日志
[
- J- l& w2 v6 u) g6 j6 y    {
! _( ?0 j% E! x* J, \  i        "shard_id": 0,
+ I8 ^) F& t; n  O8 D        "entries": [
5 _2 n9 w+ {! q/ r9 K7 {: C1 |            {3 k4 k+ q$ M/ w2 A' [1 o
                "id": "1_1540890427.972991_36.1",
5 B) P$ T: u) H! y                "section": "data",# h# L/ Z+ b4 |' u$ C" k& \  O$ }
                "name": "demo2:afd874cd-f976-4007-a77c-be6fca298b71.34353.1:3",: V. [9 x4 D3 j; u4 e& t% {- ^
                "timestamp": "2018-10-30 09:07:07.972991Z",+ X0 i$ h: F3 q+ a4 @
                "info": {
9 k) n& L: x+ G& I9 r' f/ v: }                    "source_zone": "afd874cd-f976-4007-a77c-be6fca298b71",
8 T* t% A" U9 P1 W$ S7 X3 d                    "error_code": 5,
1 b6 m, T/ o& d, O                    "message": "failed to sync bucket instance: (5) Input/output error"
2 e0 K/ X% @0 k0 w/ o                }
  E) g* g# s) n7 D            },1 l% t5 r2 M6 m+ M2 @. h
......
$ Z0 _* y+ o9 c9 [6 O9 Q9 g* \            {( r$ S9 f% q0 ~* D; b
                "id": "1_1543395420.626552_32014.1",
7 |8 K* H8 Q- u. c' V                "section": "data",/ Z+ T! A2 x! S- |
                "name": "demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1:0/file1205085",
; d1 A0 u& {% f4 k! j/ {! P                "timestamp": "2018-11-28 08:57:00.626552Z",
3 }2 h: [; o' C& G9 V/ P                "info": {
$ C3 Q9 r% X! M  o. K8 z4 c* v$ Y                    "source_zone": "afd874cd-f976-4007-a77c-be6fca298b71",7 k6 o/ u. i" G
                    "error_code": 5,
* _2 Y9 p7 Z$ c3 d                    "message": "failed to sync object(5) Input/output error"
+ [8 O7 B7 O& k5 U                }
* Q! v, r) U" D1 Z2 v/ h) y            }
6 Q( W; K. ?& w6 D
1 j: o4 G0 r, \* Q8 j8 b$ {. r( Y
[root@TX-97-140-6 supdev]# radosgw-admin sync error trim --start-date=2018-11-14 --end-date=2018-11-28 #按日期清理错误日志记录1 ^* Y' a2 }8 m2 p, V
复制0 }; y2 \: o/ {8 N. q8 }
优化定位效率
简单写了个脚本,先根据warn信息找pool,之后再根据pool找出有large omap objects的pg,凑合用,不保证没bug,在12.2.10下面测试通过。
+ h4 q% N5 ~8 ^3 c2 N3 a/ w8 g
[root@demo cephuser]# cat large_obj.py# W. W9 K0 M3 J: R8 N
import json9 f: j+ E8 |' N5 F. t9 x
import rados
% E# P1 R! m1 [0 {! a% limport rbd0 ~  `7 |( v/ |/ u: `( ?7 j- H' s

2 |# R) T/ k- v( ?ceph_conf_path = '/etc/ceph/ceph.conf'
) I2 q" A$ Q+ D& ~+ srados_connect_timeout = 5
" u2 Q, L5 U7 |/ U3 B! u- q, i0 }) O! g" V; g
class RADOSClient(object):
    """Context manager that holds a RADOS connection obtained from a driver.

    The connection is opened in the constructor through
    ``driver._connect_to_rados(pool)`` and released on ``with``-block exit
    through ``driver._disconnect_from_rados(client, ioctx)``.

    Attributes:
        driver: the object that provided the connection.
        client: first element returned by ``driver._connect_to_rados``.
        ioctx: second element returned by ``driver._connect_to_rados``.
    """

    def __init__(self, driver, pool=None):
        self.driver = driver
        self.client, self.ioctx = driver._connect_to_rados(pool)

    def __enter__(self):
        return self

    def __exit__(self, type_, value, traceback):
        # Always hand the connection back to the driver; the return value is
        # None, so an exception raised inside the with-block still propagates.
        self.driver._disconnect_from_rados(self.client, self.ioctx)
9 x6 A7 {9 Y1 w. {* `
class RBDDriver(object):
    """Opens and closes librados connections from a fixed configuration.

    Args:
        ceph_conf_path: configuration file path passed to ``rados.Rados``.
        rados_connect_timeout: passed as ``connect(timeout=...)`` when it is
            ``>= 0``; a negative value selects ``connect()`` with no timeout.
        pool: default pool to open an I/O context on; ``None`` means no
            I/O context is opened unless a pool is given per call.
    """

    def __init__(self, ceph_conf_path, rados_connect_timeout, pool=None):
        self.ceph_conf_path = ceph_conf_path
        self.rados_connect_timeout = rados_connect_timeout
        self.pool = pool

    def _connect_to_rados(self, pool=None):
        """Connect to the cluster and optionally open an I/O context.

        Args:
            pool: pool to open; falls back to ``self.pool`` when ``None``.

        Returns:
            ``(client, ioctx)``; ``ioctx`` is ``None`` when no pool applies.

        Raises:
            rados.Error: if connecting or opening the pool fails. The client
                is shut down before the error is raised.
        """
        # Honour the per-call pool; previously the argument was ignored and
        # only self.pool was consulted.
        if pool is None:
            pool = self.pool
        client = rados.Rados(conffile=self.ceph_conf_path)
        try:
            if self.rados_connect_timeout >= 0:
                client.connect(timeout=self.rados_connect_timeout)
            else:
                client.connect()
            if pool is None:
                ioctx = None
            else:
                ioctx = client.open_ioctx(pool)
            return client, ioctx
        except rados.Error as err:
            client.shutdown()
            # Raising a bare string is itself a TypeError; raise a real
            # exception that also carries the underlying cause.
            raise rados.Error(
                "Error connecting to ceph cluster. ({0})".format(err))

    def _disconnect_from_rados(self, client, ioctx=None):
        """Close ``ioctx`` if one was opened, then shut the client down."""
        if ioctx is not None:
            ioctx.close()
        client.shutdown()

class cmd_manager(object):
    """Runs monitor commands used to track down large omap objects."""

    def _mon_command(self, command):
        """Send ``command`` (a dict) to the monitors.

        Returns:
            ``(ret, outbuf)`` - the first two items of the tuple returned by
            ``client.mon_command``, with ``outbuf`` decoded to text.
        """
        with RADOSClient(RBDDriver(ceph_conf_path, rados_connect_timeout)) as dr:
            # json.dumps escapes quotes in values; b'' is the (empty) input
            # buffer, spelled as bytes so it is valid on Python 2 and 3.
            result = dr.client.mon_command(json.dumps(command), b'')
        outbuf = result[1]
        if isinstance(outbuf, bytes):
            outbuf = outbuf.decode('utf-8')
        return result[0], outbuf

    def get_large_omap_obj_poolname(self):
        """Return the pool named in the LARGE_OMAP_OBJECTS health check.

        The name is the text between the first pair of single quotes in the
        first detail message of the check. Only that first entry is looked
        at, so at most one pool is reported.

        Returns:
            The pool name, or ``False`` when the command fails, the check is
            not present, or no quoted name is found.
        """
        ret, outbuf = self._mon_command(
            {"prefix": "health", "detail": "detail", "format": "json"})
        if ret != 0:
            return False
        check = json.loads(outbuf).get("checks", {}).get("LARGE_OMAP_OBJECTS")
        if not check:
            # Warning not raised: previously a KeyError.
            return False
        detail = check.get("detail") or []
        if not detail:
            return False
        parts = detail[0].get("message", "").split("'")
        if len(parts) < 2:
            return False
        return parts[1]

    def get_pg_list_by_pool(self, poolname):
        """Return the parsed JSON output of ``pg ls-by-pool`` for ``poolname``.

        Returns:
            The decoded JSON value, or ``False`` when ``poolname`` is empty
            or the command fails.
        """
        if not poolname:
            return False
        ret, outbuf = self._mon_command(
            {"prefix": "pg ls-by-pool", "poolstr": poolname, "format": "json"})
        if ret != 0:
            return False
        return json.loads(outbuf)

# Report the pool named in the LARGE_OMAP_OBJECTS warning, then every PG in
# that pool whose stats show a non-zero num_large_omap_objects.
cmd_ = cmd_manager()
poolname = cmd_.get_large_omap_obj_poolname()
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("Large omap objects poolname = {0}".format(poolname))
# Skip the PG query when no pool was found, and treat a failed query as an
# empty list instead of iterating over False.
res = cmd_.get_pg_list_by_pool(poolname) if poolname else False
for i in res or []:
    if i["stat_sum"]["num_large_omap_objects"] != 0:
        print("pgid={0} OSDs={1} num_large_omap_objects={2}".format(
            i["pgid"], i["acting"], i["stat_sum"]["num_large_omap_objects"]))
复制
再爆一个雷
如果你认为通过上面方式清除omap集群就能立马恢复状态,那就太天真,告警信息“HEALTH_WARN 32 large omap objects”依然挂在那里不尴不尬,虽然omap清理了,但是因为对应PG状态没更新,所以告警信息依然存在,只能通过手工或者其他方式去触发PG的状态更新,我这边是通过ceph pg deep-scrub {pg}去触发pg信息更新,注意如果你用scrub是没用,必须deep-scrub,这里又要吐槽官方的逻辑设计,真是WFK!当然你也可以放那里不管,等后台自动deep-scrub也能恢复。

您需要登录后才可以回帖 登录 | 注册

本版积分规则

返回首页|Archiver|手机版|小黑屋|易陆发现技术论坛 ( 蜀ICP备2026014127号-1 )

GMT+8, 2026-6-12 00:06 , Processed in 0.025576 second(s), 23 queries .

Powered by Discuz! X5.0

© 2001-2026 Discuz! Team.

快速回复 返回顶部 返回列表