|
|
楼主 |
发表于 2022-8-4 13:39:30
|
显示全部楼层
1、前言
这里演示在使用cephfs时,如果cephfs的元数据损坏或丢失了,该如何恢复出用户数据。下面就为大家演示下如何恢复。

2、准备测试环境
2.1、准备测试集群
我是基于L版本做的实验,J版也是可以的。测试环境如下:

[root@ceph05 ~]# ceph -v
ceph version 12.2.11 (26dc3775efc7bb286a1d6d66faee0ba30ea23eee) luminous (stable)

[root@ceph05 ~]# ceph osd tree
ID CLASS WEIGHT  TYPE NAME       STATUS REWEIGHT PRI-AFF
-1       0.07397 root default
-3       0.03699     host ceph05
 0   hdd 0.01799         osd.0       up  1.00000 1.00000
 1   hdd 0.01900         osd.1       up  1.00000 1.00000
-5       0.03699     host ceph06
 2   hdd 0.01900         osd.2       up  1.00000 1.00000
 3   hdd 0.01900         osd.3       up  1.00000 1.00000

[root@ceph05 deployceph]# ceph -s
  cluster:
    id:     176feab8-ca22-47bf-b809-202deac53c6f
    health: HEALTH_WARN
            crush map has straw_calc_version=0

  services:
    mon: 1 daemons, quorum ceph05
    mgr: ceph05(active)
    mds: cephfs-1/1/1 up {0=ceph05=up:active}
    osd: 4 osds: 4 up, 4 in

  data:
    pools:   10 pools, 304 pgs
    objects: 918 objects, 2.60GiB
    usage:   6.70GiB used, 71.3GiB / 78.0GiB avail
    pgs:     304 active+clean
2.2、准备测试数据
挂载kc

[root@ceph05 deployceph]# mount -t ceph 192.168.10.30:/ /cephfs
[root@ceph05 deployceph]# df -h|grep ceph
···
192.168.10.30:/   78G  6.8G   72G   9% /cephfs
···
写入数据(这里我写入了几个比较有代表性的文件类型:txt、jpg、png、pdf、word、excel)

[root@ceph05 deployceph]# ll /cephfs/
total 5912
-rw-r--r-- 1 root root   31232 Mar 15 12:18 111.doc
-rw-r--r-- 1 root root   20593 Mar 15 12:18 22.xlsx
-rw-r--r-- 1 root root   12494 Mar 15 12:17 5be23a3eec2c0.png
-rw-r--r-- 1 root root    3189 Mar 15 12:17 cmap.txt
-rw-r--r-- 1 root root 5985243 Mar 15 12:17 hello0.pdf
3、模拟故障
这里直接模拟元数据丢失的情况,删除metadata池里面所有的元数据对象:

[root@ceph05 deployceph]# rados -p metadata ls|xargs -i rados -p metadata rm {}
[root@ceph05 deployceph]# ceph df
GLOBAL:
    SIZE        AVAIL       RAW USED     %RAW USED
    78.0GiB     71.3GiB      6.71GiB          8.60
POOLS:
    NAME                          ID     USED        %USED     MAX AVAIL     OBJECTS
    .rgw.root                     1      1.09KiB         0       63.3GiB           4
    default.rgw.control           2           0B         0       63.3GiB           8
    default.rgw.meta              3         720B         0       63.3GiB           5
    default.rgw.log               4           0B         0       63.3GiB         207
    default.rgw.buckets.index     5           0B         0       63.3GiB           1
    default.rgw.buckets.data      6      1.02KiB         0       63.3GiB           2
    pool01                        7      2.60GiB      3.94       63.3GiB         666
    rbd                           8          36B         0       63.3GiB           4
    metadata                      17          0B         0       63.3GiB           0
    data                          18     5.77MiB         0       63.3GiB           6
可以看到metadata池里面已经没有对象了。因为mds里面会缓存元数据信息,所以要重启下mds才能看到效果:

[root@ceph05 deployceph]# systemctl restart ceph-mds@ceph05
[root@ceph05 deployceph]#
[root@ceph05 deployceph]#
[root@ceph05 deployceph]# ceph -s
  cluster:
    id:     176feab8-ca22-47bf-b809-202deac53c6f
    health: HEALTH_WARN
            1 filesystem is degraded
            1 filesystem has a failed mds daemon
            crush map has straw_calc_version=0

  services:
    mon: 1 daemons, quorum ceph05
    mgr: ceph05(active)
    mds: cephfs-0/1/1 up , 1 failed
    osd: 4 osds: 4 up, 4 in

  data:
    pools:   10 pools, 304 pgs
    objects: 905 objects, 2.60GiB
    usage:   6.71GiB used, 71.3GiB / 78.0GiB avail
    pgs:     304 active+clean
看到集群现在不正常了,访问kc里面的数据卡住,说明数据已经无法正常读取了。

4、开始恢复
使用我编写的py脚本(文末给出了源码)恢复,把脚本放到集群任意一台节点上执行:

[root@ceph05 rcy]# python recovery_cephfs.py -p data
-p指定cephfs的数据池,运行完之后会在当前目录下产生两个文件夹和一个运行脚本的日志文件。

[root@ceph05 rcy]# ll
total 16
-rw-r--r-- 1 root root 3826 Mar 15 13:57 recovery_cephfs.py
drwxr-xr-x 2 root root  120 Mar 15 13:57 recoveryfiles
-rw-r--r-- 1 root root 4804 Mar 15 13:57 recovery.log
drwxr-xr-x 2 root root 4096 Mar 15 13:57 recoveryobjs
查看恢复出来的文件在recoveryfiles文件夹下:

[root@ceph05 rcy]# ll recoveryfiles/
total 12364
-rw-r--r-- 1 root root 5985243 Mar 15 13:57 10000000000-pdf
-rw-r--r-- 1 root root    3189 Mar 15 13:57 10000000001-text
-rw-r--r-- 1 root root   12494 Mar 15 13:57 10000000002-png
-rw-r--r-- 1 root root   31232 Mar 15 13:57 10000000003-text
-rw-r--r-- 1 root root   20593 Mar 15 13:57 10000000004-excel
文件名格式为“文件在cephfs里面的inode-该文件可能的类型”。恢复出来的文件名后面会给出该文件的类型,这样就可以使用合适的软件打开该文件来验证文件是否完整。

5、总结
在cephfs文件系统的元数据完全损坏的情况下,只要数据池对象不丢失,就可以恢复出完整的数据。恢复的思路如下:

获取数据池对象
根据inode找到该文件的所有对象
拼接对象
使用脚本注意事项:

现在的脚本只加入了txt、jpg、png、pdf、word、excel这些文件类型的识别,需要其他类型的就需要自己加入到脚本里面了
只适合副本池
如果数据量特别大,不适合使用脚本,不过可以参考脚本的思路去一个一个文件恢复
6、脚本
# coding: utf-8
"""Recover CephFS file contents directly from the objects of a data pool.

Every object of the given pool is downloaded into ``recoveryobjs`` and the
objects that share an inode prefix are concatenated into one file under
``recoveryfiles``. Progress and executed commands are logged to
``./recovery.log`` (relative to the current working directory).
"""
import os
import shutil
import json
import sys
import subprocess
import copy
import logging
import argparse

__auth__ = 'ypdai'
SLEEP_INTERVAL = 1
logging.basicConfig(filename='./recovery.log', format='%(asctime)s : %(levelname)s %(message)s',
                    level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')

# Both working directories live next to this script, not in the current directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Raw objects fetched from the data pool, one file per object.
RECOVERY_OBJ_DIR = os.path.join(BASE_DIR, 'recoveryobjs')
# Reassembled files, named "<inode>-<guessed type>".
RECOVERY_FILE_DIR = os.path.join(BASE_DIR, 'recoveryfiles')


def exec_cmd(cmd):
    """
    Run a shell command and return its output together with its exit status.

    stderr is merged into stdout, so the returned output contains both streams.

    :param cmd: command line; it is interpreted by the shell (shell=True), so
                callers must quote any argument that is not a literal
    :return: (out, code) where out is the combined output as text and code is
             the process return code
    """
    logging.info('exec_cmd():: cmd: {}'.format(cmd))
    # universal_newlines=True makes the output text on Python 3 as well, which
    # is what the callers' str operations (split(':'), 'x' in out) expect.
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    # communicate() drains the pipe while waiting. Calling wait() first and
    # read() afterwards blocks forever once the child fills the pipe buffer.
    out, _ = p.communicate()
    code = p.returncode
    logging.info('exec_cmd():: cmd exec out: {}, code: {}'.format(out, code))
    return out, code


def prepare(pool_name):
    """
    Check the preconditions for a recovery run and reset the work directories.

    1. Verify that pool_name is a data pool of at least one CephFS filesystem
       (as reported by ``ceph fs ls``).
    2. (Re)create the recoveryobjs and recoveryfiles directories; anything
       left over from a previous run is deleted.

    :param pool_name: name of the CephFS data pool to recover from
    :return: True when the pool is valid and the directories are ready,
             False when the filesystem list cannot be read or the pool is not
             a data pool of any filesystem (nothing is touched in that case)
    """
    out, code = exec_cmd('ceph fs ls -f json-pretty')
    if code != 0:
        logging.error('prepare():: list cephfs filesystems failed.')
        return False

    try:
        filesystems = json.loads(out)
    except ValueError:
        logging.error('prepare():: cannot parse output of ceph fs ls.')
        return False

    # The pool only has to belong to ONE filesystem. Rejecting as soon as the
    # first filesystem lacks it would refuse valid pools on multi-fs clusters.
    if not any(pool_name in (fs.get('data_pools') or []) for fs in filesystems):
        return False

    for work_dir in (RECOVERY_OBJ_DIR, RECOVERY_FILE_DIR):
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.mkdir(work_dir)

    return True


def get_file_type(file_path):
    """
    Guess a short type label for a file from the description printed by file(1).

    This is a keyword heuristic on the description text; anything that is not
    recognised is labelled 'text'.

    :param file_path: path of the file to inspect
    :return: one of 'word', 'excel', 'pdf', 'text', 'jpg', 'png'
    """
    import re  # local import: keeps this function usable without touching the file's import block

    # -b prints the description without the "<path>:" prefix, so the whole
    # description is inspected. Taking only the text after the LAST ':' loses
    # the type of formats whose description itself contains colons
    # (e.g. "..., Name of Creating Application: Microsoft Office Word, Create Time/Date: ...").
    cmd = "file -b -- '%s'" % file_path.replace("'", "'\\''")
    out, code = exec_cmd(cmd)
    if isinstance(out, bytes) and not isinstance(out, str):
        # Python 3 bytes output; on Python 2 bytes is str and this is skipped.
        out = out.decode('utf-8', 'replace')
    out = out.lower()

    # Checked in the original priority order. Word boundaries keep e.g.
    # "keywords" from being read as "word".
    keywords = (
        ('word', 'word'),
        ('excel', 'excel'),
        ('pdf', 'pdf'),
        ('text', 'text'),
        ('jpeg', 'jpg'),
        ('png', 'png'),
    )
    for keyword, file_type in keywords:
        if re.search(r'\b%s\b' % keyword, out):
            return file_type
    return 'text'


def _sh_quote(value):
    """Wrap value in single quotes so the shell passes it on as one literal word."""
    return "'" + str(value).replace("'", "'\\''") + "'"


def do_recovery(pool_name):
    """
    Rebuild files from the objects of a CephFS data pool.

    Steps:
    1. List every object in the pool and download it into RECOVERY_OBJ_DIR.
    2. Group the objects by the part of their name before the '.' (the inode)
       and concatenate each group, ordered by the hexadecimal index after the
       '.', into RECOVERY_FILE_DIR/<inode>-<type>.
    3. <type> is guessed from the first object of the group.

    Objects whose name is not "<inode>.<hex index>" are skipped. An inode with
    any object that could not be downloaded is not reassembled. Objects are
    simply concatenated, so a file with missing stripes (sparse file) comes out
    misaligned; a warning is logged for it.
    NOTE(review): this presumably only suits the default file layout, where a
    file's objects are consecutive chunks - confirm for custom layouts.

    :param pool_name: name of the CephFS data pool
    :return: None; failures are reported in the log
    """
    out, code = exec_cmd('rados -p %s ls' % _sh_quote(pool_name))
    if code != 0:
        logging.error('do_recovery():: get obj from rados failed.')
        return
    if isinstance(out, bytes) and not isinstance(out, str):
        # Python 3 bytes output; on Python 2 bytes is str and this is skipped.
        out = out.decode('utf-8', 'replace')

    chunks_by_inode = {}
    broken_inodes = set()
    for obj in out.split():
        parts = obj.split('.')
        try:
            if len(parts) != 2 or not parts[0] or '/' in obj:
                raise ValueError(obj)
            index = int(parts[1], 16)
        except ValueError:
            logging.warning('do_recovery():: skip unexpected object name: %s' % obj)
            continue
        inode = parts[0]

        # Every download is checked on its own; a shell for-loop would only
        # report the status of the last object.
        obj_path = os.path.join(RECOVERY_OBJ_DIR, obj)
        _, code = exec_cmd('rados -p %s get %s %s' % (
            _sh_quote(pool_name), _sh_quote(obj), _sh_quote(obj_path)))
        if code != 0:
            logging.error('do_recovery():: get obj %s from rados failed.' % obj)
            broken_inodes.add(inode)
            continue
        # Grouping on the exact inode string: a prefix match would mix
        # inode 10000000000 with inode 100000000001.
        chunks_by_inode.setdefault(inode, []).append((index, obj))

    for inode in sorted(chunks_by_inode):
        if inode in broken_inodes:
            logging.error('do_recovery():: skip inode %s, some of its objects are missing.' % inode)
            continue

        chunks = sorted(chunks_by_inode[inode])
        if [index for index, _ in chunks] != list(range(len(chunks))):
            logging.warning('do_recovery():: inode %s has non-contiguous objects, '
                            'recovered file may be misaligned.' % inode)

        head_path = os.path.join(RECOVERY_OBJ_DIR, chunks[0][1])
        file_type = get_file_type(head_path)
        target = os.path.join(RECOVERY_FILE_DIR, '%s-%s' % (inode, file_type))
        try:
            with open(target, 'wb') as dst:
                for _, name in chunks:
                    with open(os.path.join(RECOVERY_OBJ_DIR, name), 'rb') as src:
                        shutil.copyfileobj(src, dst)
        except (IOError, OSError) as err:
            # Keep going: one unreadable or unwritable file must not stop the others.
            logging.error('do_recovery():: assemble inode %s failed: %s' % (inode, err))


if __name__ == '__main__':
    # Command line entry point: recovery_cephfs.py -p <cephfs data pool>
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--pool', required=True, type=str, dest='pool',
                        help='select given cephfs data pool by name')
    args = parser.parse_args()

    if not prepare(args.pool):
        logging.error('main():: invalid pool name.')
        # Also tell the user on the console; the log file alone makes the
        # non-zero exit look like a silent failure.
        sys.stderr.write('invalid pool name: %s (see recovery.log)\n' % args.pool)
        sys.exit(1)

    logging.info('=== main():: recovery start')
    do_recovery(args.pool)
    logging.info('=== main():: recovery done')
|