|
|
尝试二、修复down掉的osd

该方法主要适用于某个osd物理损坏、导致无法激活的情况。

1、查看osd树

root@ceph01:~# ceph osd tree
ID WEIGHT  TYPE NAME       UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 0.29279 root default
-2 0.14639     host ceph01
 0 0.14639         osd.0        up  1.00000          1.00000
-3 0.14639     host ceph02
 1 0.14639         osd.1      down        0          1.00000

发现osd.1是down掉的。

2、将osd.1的状态设置为out

root@ceph02:~# ceph osd out osd.1
osd.1 is already out.
3、从集群中删除

root@ceph02:~# ceph osd rm osd.1
removed osd.1
4、从CRUSH中删除

root@ceph02:~# ceph osd crush rm osd.1
removed item id 1 name 'osd.1' from crush map
5、删除osd.1的认证信息

root@ceph02:~# ceph auth del osd.1
updated
6、umount

umount /dev/sdb1

7、再次查看osd的集群状态

root@ceph02:~# ceph osd tree
ID WEIGHT  TYPE NAME       UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 0.14639 root default
-2 0.14639     host ceph01
 0 0.14639         osd.0        up  1.00000          1.00000
-3       0     host ceph02

8、登录ceph-deploy节点

root@ceph01:~# cd /root/my-cluster/
root@ceph01:~/my-cluster#
9、初始化磁盘

ceph-deploy --overwrite-conf osd prepare ceph02:/dev/sdb1

在较新版本的ceph-deploy中,需要改为执行以下命令重新添加osd(node1为示例主机名,请替换为实际主机名):

ceph-deploy osd create node1 --data /dev/sdb

10、再次激活所有的osd(记住是所有的,不只是down掉的这一个)

ceph-deploy osd activate ceph01:/dev/sdb1 ceph02:/dev/sdb1
11、查看osd树和健康状态

root@ceph01:~/my-cluster# ceph osd tree
ID WEIGHT  TYPE NAME       UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 0.29279 root default
-2 0.14639     host ceph01
 0 0.14639         osd.0        up  1.00000          1.00000
-3 0.14639     host ceph02
 1 0.14639         osd.1        up  1.00000          1.00000
root@ceph01:~/my-cluster#

root@ceph01:~/my-cluster# ceph -s
    cluster ecacda71-af9f-46f9-a2a3-a35c9e51db9e
     health HEALTH_OK
     monmap e1: 1 mons at {ceph01=10.111.131.125:6789/0}
            election epoch 14, quorum 0 ceph01
     osdmap e150: 2 osds: 2 up, 2 in
            flags sortbitwise,require_jewel_osds
      pgmap v9284: 64 pgs, 1 pools, 17 bytes data, 3 objects
            10310 MB used, 289 GB / 299 GB avail
                  64 active+clean

只有health为 HEALTH_OK 才算是正常的。

|
|