# 0. This guide is for upgrading from Rook v1.9.x to Rook v1.10.x.
## Starting versions
### rook: v1.9.4
### ceph: v16.2.9
### cephcsi: v3.6.1
# 1. Health Verification
## 1.1 Pods all Running
### In a healthy Rook cluster, all pods in the Rook namespace should be in the Running (or Completed) state and have few, if any, pod restarts.
export ROOK_OPERATOR_NAMESPACE=rook-ceph
export ROOK_CLUSTER_NAMESPACE=rook-ceph
kubectl -n $ROOK_CLUSTER_NAMESPACE get pods
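### To surface only problem pods, a standard kubectl field selector can filter out the healthy ones (a minimal sketch):
kubectl -n $ROOK_CLUSTER_NAMESPACE get pods --field-selector status.phase!=Running,status.phase!=Succeeded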
## 1.2 Status Output
### The Rook toolbox contains the Ceph tools that can give you status details of the cluster with the ceph status command. Let's look at an output sample and review some of the details:
TOOLS_POD=$(kubectl -n $ROOK_CLUSTER_NAMESPACE get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[*].metadata.name}')
kubectl -n $ROOK_CLUSTER_NAMESPACE exec -it $TOOLS_POD -- ceph status
# cluster:
#   id:     a3f4d647-9538-4aff-9fd1-b845873c3fe9
#   health: HEALTH_OK
#
# services:
#   mon: 3 daemons, quorum b,c,a
#   mgr: a(active)
#   mds: myfs-1/1/1 up {0=myfs-a=up:active}, 1 up:standby-replay
#   osd: 6 osds: 6 up, 6 in
#   rgw: 1 daemon active
#
# data:
#   pools:   9 pools, 900 pgs
#   objects: 67 objects, 11 KiB
#   usage:   6.1 GiB used, 54 GiB / 60 GiB avail
#   pgs:     900 active+clean
#
# io:
#   client:   7.4 KiB/s rd, 681 B/s wr, 11 op/s rd, 4 op/s wr
#   recovery: 164 B/s, 1 objects/s
### In the output above, note the following indications that the cluster is in a healthy state:
### Cluster health: The overall cluster status is HEALTH_OK and there are no warning or error status messages displayed.
### Monitors (mon): All of the monitors are included in the quorum list.
### Manager (mgr): The Ceph manager is in the active state.
### OSDs (osd): All OSDs are up and in.
### Placement groups (pgs): All PGs are in the active+clean state.
### (If applicable) Ceph filesystem metadata servers (mds): All MDS daemons are active for each filesystem.
### (If applicable) Ceph object store RADOS gateways (rgw): All daemons are active.
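### If the cluster instead reports HEALTH_WARN or HEALTH_ERR, the specific issues should be listed and resolved before upgrading; ceph health detail (a standard Ceph command, run via the toolbox) shows them:
kubectl -n $ROOK_CLUSTER_NAMESPACE exec -it $TOOLS_POD -- ceph health detail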
## 1.3 Container Versions
### The container version running in a specific pod in the Rook cluster can be verified in its pod spec output. For example, for the monitor pod mon-b, we can verify its running container version with the commands below:
POD_NAME=$(kubectl -n $ROOK_CLUSTER_NAMESPACE get pod -o custom-columns=name:.metadata.name --no-headers | grep rook-ceph-mon-b)
kubectl -n $ROOK_CLUSTER_NAMESPACE get pod ${POD_NAME} -o jsonpath='{.spec.containers[0].image}'
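### The output is the Ceph image the monitor is running; given the starting versions above, something like:
# quay.io/ceph/ceph:v16.2.9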
### The status and container versions for all Rook pods can be collected all at once with the following commands:
kubectl -n $ROOK_OPERATOR_NAMESPACE get pod -o jsonpath='{range .items[*]}{.metadata.name}{"\n\t"}{.status.phase}{"\t\t"}{.spec.containers[0].image}{"\t"}{.spec.initContainers[0].image}{"\n"}{end}' && \
kubectl -n $ROOK_CLUSTER_NAMESPACE get pod -o jsonpath='{range .items[*]}{.metadata.name}{"\n\t"}{.status.phase}{"\t\t"}{.spec.containers[0].image}{"\t"}{.spec.initContainers[0].image}{"\n"}{end}'
### The rook-version label exists on Ceph resources. A summary of the resource controllers can be gathered with the commands below, which report the requested, updated, and currently available replicas for various Rook resources, in addition to the version of Rook for resources managed by Rook. Note that the operator and toolbox deployments do not have a rook-version label set.
kubectl -n $ROOK_CLUSTER_NAMESPACE get deployments -o jsonpath='{range .items[*]}{.metadata.name}{" \treq/upd/avl: "}{.spec.replicas}{"/"}{.status.updatedReplicas}{"/"}{.status.readyReplicas}{" \trook-version="}{.metadata.labels.rook-version}{"\n"}{end}'
kubectl -n $ROOK_CLUSTER_NAMESPACE get jobs -o jsonpath='{range .items[*]}{.metadata.name}{" \tsucceeded: "}{.status.succeeded}{" \trook-version="}{.metadata.labels.rook-version}{"\n"}{end}'
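### A sample line from the deployments query above, given the starting versions of this guide:
# rook-ceph-mon-a 	req/upd/avl: 1/1/1 	rook-version=v1.9.4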
## 1.4 Rook Volume Health
### Any pod that is using a Rook volume should also remain healthy:
### The pod should be in the Running state with few, if any, restarts
### There should be no errors in its logs
### The pod should still be able to read and write to the attached Rook volume.
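### A quick sketch for spot-checking volumes: list PersistentVolumes provisioned by the Rook CSI drivers and confirm they are Bound (the driver names are prefixed with the operator namespace, rook-ceph by default):
kubectl get pv -o custom-columns=NAME:.metadata.name,DRIVER:.spec.csi.driver,PHASE:.status.phase | grep rook-ceph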
# 2. Rook Upgrades
## 2.1 Breaking changes in v1.10
### Support for Ceph Octopus (15.2.x) was removed. If you are running v15, you must upgrade to Ceph Pacific (v16) or Quincy (v17) before upgrading to Rook v1.10.
### The minimum supported version of Ceph-CSI is v3.6.0. You must update to at least this version of Ceph-CSI before, or at the same time as, updating the Rook operator image to v1.10.
### Before upgrading to K8s 1.25, ensure that you are running at least Rook v1.9.10, or v1.10.x. If you upgrade to K8s 1.25 before upgrading to v1.9.10 or newer, the Helm chart may be blocked from upgrading to newer versions of Rook. See #10826 for a possible workaround.
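### Before starting, the currently requested Ceph image and the running Ceph-CSI plugin image can be checked against these requirements (a sketch; app=csi-rbdplugin is the label Rook applies to its RBD plugin pods):
kubectl -n $ROOK_CLUSTER_NAMESPACE get cephcluster $ROOK_CLUSTER_NAMESPACE -o jsonpath='{.spec.cephVersion.image}{"\n"}'
kubectl -n $ROOK_OPERATOR_NAMESPACE get pod -l app=csi-rbdplugin -o jsonpath='{.items[0].spec.containers[*].image}{"\n"}'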
## 2.2 Patch Release Upgrades
### Unless otherwise noted due to extenuating requirements, upgrading from one patch release of Rook to another is as simple as updating the common resources and the image of the Rook operator. For example, when Rook v1.10.12 is released, updating from v1.10.0 is just a matter of running the following:
git clone --single-branch --depth=1 --branch v1.10.12 https://github.com/rook/rook.git
cd rook/deploy/examples
kubectl apply -f common.yaml -f crds.yaml
# Alternatively, the operator image alone can be updated in place:
# kubectl -n $ROOK_OPERATOR_NAMESPACE set image deploy/rook-ceph-operator rook-ceph-operator=rook/ceph:v1.10.12
# Edit operator.yaml: override the CSI sidecar images with a mirror registry (useful when the upstream registries are unreachable):
ROOK_CSI_REGISTRAR_IMAGE: "registry.cn-hangzhou.aliyuncs.com/google_containers/csi-node-driver-registrar:v2.7.0"
ROOK_CSI_RESIZER_IMAGE: "registry.cn-hangzhou.aliyuncs.com/google_containers/csi-resizer:v1.7.0"
ROOK_CSI_PROVISIONER_IMAGE: "registry.cn-hangzhou.aliyuncs.com/google_containers/csi-provisioner:v3.4.0"
ROOK_CSI_SNAPSHOTTER_IMAGE: "registry.cn-hangzhou.aliyuncs.com/google_containers/csi-snapshotter:v6.2.1"
ROOK_CSI_ATTACHER_IMAGE: "registry.cn-hangzhou.aliyuncs.com/google_containers/csi-attacher:v4.1.0"
kubectl apply -f operator.yaml
kubectl apply -f toolbox.yaml
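### After applying, confirm the operator deployment picked up the new image before waiting on the daemons:
kubectl -n $ROOK_OPERATOR_NAMESPACE get deployment rook-ceph-operator -o jsonpath='{.spec.template.spec.containers[0].image}{"\n"}'
# rook/ceph:v1.10.12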
## 2.3 Wait for the upgrade to complete
watch --exec kubectl -n $ROOK_CLUSTER_NAMESPACE get deployments -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{" \treq/upd/avl: "}{.spec.replicas}{"/"}{.status.updatedReplicas}{"/"}{.status.readyReplicas}{" \trook-version="}{.metadata.labels.rook-version}{"\n"}{end}'
kubectl -n $ROOK_CLUSTER_NAMESPACE get deployment -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{"rook-version="}{.metadata.labels.rook-version}{"\n"}{end}' | sort | uniq
# This cluster is not yet finished:
# rook-version=v1.9.13
# rook-version=v1.10.12
# This cluster is finished:
# rook-version=v1.10.12
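### To block until the rollout converges instead of watching, a small loop over the same label query works (a sketch; adjust the interval to taste):
while [ "$(kubectl -n $ROOK_CLUSTER_NAMESPACE get deployment -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{.metadata.labels.rook-version}{"\n"}{end}' | sort -u | wc -l)" -gt 1 ]; do sleep 15; done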
## 2.4 Verify the updated cluster
TOOLS_POD=$(kubectl -n $ROOK_CLUSTER_NAMESPACE get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[*].metadata.name}')
kubectl -n $ROOK_CLUSTER_NAMESPACE exec -it $TOOLS_POD -- ceph status
# 3. Ceph Upgrades
## 3.1 Supported Versions
### Rook v1.10 supports the following Ceph versions:
### Ceph Quincy v17.2.0 or newer
### Ceph Pacific v16.2.0 or newer
## 3.2 Update the Ceph daemons
### The upgrade will be automated by the Rook operator after you update the desired Ceph image in the cluster CRD (spec.cephVersion.image).
ROOK_CLUSTER_NAMESPACE=rook-ceph
NEW_CEPH_IMAGE='quay.io/ceph/ceph:v17.2.5-20221017'
kubectl -n $ROOK_CLUSTER_NAMESPACE patch CephCluster $ROOK_CLUSTER_NAMESPACE --type=merge -p "{\"spec\": {\"cephVersion\": {\"image\": \"$NEW_CEPH_IMAGE\"}}}"
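### The operator reacts to the change with a rolling update of the daemons; progress is also reflected in the CephCluster status fields:
kubectl -n $ROOK_CLUSTER_NAMESPACE get cephcluster $ROOK_CLUSTER_NAMESPACE -o jsonpath='{.status.phase}{"\t"}{.status.ceph.health}{"\n"}'
# Progressing	HEALTH_OK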
## 3.3 Wait for the pod updates
### As with upgrading Rook, you must now wait for the upgrade to complete. Status can be determined in a similar way to the Rook upgrade as well.
watch --exec kubectl -n $ROOK_CLUSTER_NAMESPACE get deployments -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{" \treq/upd/avl: "}{.spec.replicas}{"/"}{.status.updatedReplicas}{"/"}{.status.readyReplicas}{" \tceph-version="}{.metadata.labels.ceph-version}{"\n"}{end}'
### Confirm the upgrade is completed when the versions are all on the desired Ceph version.
kubectl -n $ROOK_CLUSTER_NAMESPACE get deployment -l rook_cluster=$ROOK_CLUSTER_NAMESPACE -o jsonpath='{range .items[*]}{"ceph-version="}{.metadata.labels.ceph-version}{"\n"}{end}' | sort | uniq
# This cluster is not yet finished:
# ceph-version=v16.2.9-0
# ceph-version=v17.2.5-0
# This cluster is finished:
# ceph-version=v17.2.5-0
## 3.4 Verify cluster health
TOOLS_POD=$(kubectl -n $ROOK_CLUSTER_NAMESPACE get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[*].metadata.name}')
kubectl -n $ROOK_CLUSTER_NAMESPACE exec -it $TOOLS_POD -- ceph status
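### All daemons should now report the same release; ceph versions (a standard Ceph command) makes any stragglers obvious. Sample converged output for the image above, with the git hash elided:
kubectl -n $ROOK_CLUSTER_NAMESPACE exec -it $TOOLS_POD -- ceph versions
# {
#     "mon": { "ceph version 17.2.5 (...) quincy (stable)": 3 },
#     "mgr": { "ceph version 17.2.5 (...) quincy (stable)": 1 },
#     "osd": { "ceph version 17.2.5 (...) quincy (stable)": 6 },
#     "mds": { "ceph version 17.2.5 (...) quincy (stable)": 2 },
#     "rgw": { "ceph version 17.2.5 (...) quincy (stable)": 1 },
#     "overall": { "ceph version 17.2.5 (...) quincy (stable)": 13 }
# }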