A. Install PAF
1. Prepare the nodes
Node01: 10.10.10.241
Node02: 10.10.10.242
Virtual IP: 10.10.10.243
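Both nodes should also resolve each other by hostname. A minimal /etc/hosts sketch, assuming node01 and node02 are the actual hostnames (run it on both nodes, or use DNS instead):
# cat >> /etc/hosts <<EOF
10.10.10.241 node01
10.10.10.242 node02
EOF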
2. Install pacemaker, corosync & pcs, then configure
Install pcsd as described here:
https://teguhth.blogspot.com/2018/08/how-to-install-and-configuration-pcs.html
or install the default package set recommended by PAF:
# yum install -y pacemaker resource-agents pcs fence-agents-all fence-agents-virsh
# systemctl enable corosync
# systemctl enable pacemaker
# systemctl enable pcsd
...
Skip step 6 of that post (Add ClusterIP/FloatingIP/VirtualIP on Server01) for this setup.
Follow the rest of the blog until the cluster is up, then verify:
# pcs status
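For reference, the cluster bootstrap from that post boils down to roughly the following on CentOS 7 (pcs 0.9 syntax; the cluster name mycluster matches the pcs status output later in this post). Set the hacluster password on both nodes first, then run the rest from node01 and enter that password when prompted:
# passwd hacluster
# pcs cluster auth node01 node02 -u hacluster
# pcs cluster setup --name mycluster node01 node02
# pcs cluster start --all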
3. Install streaming replication & test
https://teguhth.blogspot.com/2021/08/how-to-setting-streaminng-replication.html
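One PAF-specific detail worth handling during this step: the pgsqlms agent requires each standby's primary_conninfo to contain application_name set to that node's name (the failure in step 7 below is exactly this check). A sketch of the relevant postgresql.conf line on node02 (PostgreSQL 13 keeps recovery settings in postgresql.conf, next to an empty standby.signal file), reusing the replication user visible in the later error output and reaching the primary through the virtual IP:
primary_conninfo = 'host=10.10.10.243 user=replicate password=admin application_name=node02'
On node01 use application_name=node01 instead.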
4. Node fencing configuration
pcs cluster cib cluster1.xml
pcs -f cluster1.xml stonith create fence_vm_server1 fence_virsh \
pcmk_host_check="static-list" pcmk_host_list="node01" \
ipaddr="10.10.10.241" login="root" passwd="root" port="node01" \
identity_file="/root/.ssh/id_rsa" meta provides=unfencing
pcs -f cluster1.xml stonith create fence_vm_server2 fence_virsh \
pcmk_host_check="static-list" pcmk_host_list="node02" \
ipaddr="10.10.10.242" login="root" passwd="root" port="node02" \
identity_file="/root/.ssh/id_rsa" meta provides=unfencing
pcs -f cluster1.xml constraint location fence_vm_server1 avoids node01=INFINITY
pcs -f cluster1.xml constraint location fence_vm_server2 avoids node02=INFINITY
pcs cluster cib-push cluster1.xml
Log:
[root@node01 data]# pcs cluster cib cluster1.xml
[root@node01 data]# pcs -f cluster1.xml stonith create fence_vm_server1 fence_virsh \
> pcmk_host_check="static-list" pcmk_host_list="node01" \
> ipaddr="10.10.10.241" login="root" passwd="root" port="node01" \
> identity_file="/root/.ssh/id_rsa" meta provides=infecing
[root@node01 data]# pcs -f cluster1.xml stonith create fence_vm_server2 fence_virsh \
> pcmk_host_check="static-list" pcmk_host_list="node02" \
> ipaddr="10.10.10.242" login="root" passwd="root" port="node02" \
> identity_file="/root/.ssh/id_rsa" meta provides=infecing
[root@node01 data]# pcs -f cluster1.xml constraint location fence_vm_server1 avoids node01=INFINITY
Warning: Validation for node existence in the cluster will be skipped
[root@node01 data]# pcs -f cluster1.xml constraint location fence_vm_server2 avoids node02=INFINITY
Warning: Validation for node existence in the cluster will be skipped
[root@node01 data]# pcs cluster cib-push cluster1.xml
CIB updated
[root@node01 data]#
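Before trusting the configuration, the fencing path can be exercised by hand; pcs can ask the cluster to fence a node. Note that this really power-cycles the VM, so only try it when node02 can safely go down:
# pcs stonith fence node02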
5. Check pcs status for the fence resources
[root@node01 data]# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
Last updated: Thu Sep 16 12:12:41 2021
Last change: Thu Sep 16 12:12:14 2021 by root via cibadmin on node01
2 nodes configured
2 resource instances configured
Online: [ node01 node02 ]
Full list of resources:
fence_vm_server1 (stonith:fence_virsh): Started node02
fence_vm_server2 (stonith:fence_virsh): Started node01
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node01 data]#
6. Create the cluster resources
# pgsqld
pcs -f cluster1.xml resource create pgsqld ocf:heartbeat:pgsqlms \
bindir=/usr/pgsql-13/bin \
pgdata=/var/lib/pgsql/13/data \
op start timeout=60s \
op stop timeout=60s \
op promote timeout=30s \
op demote timeout=120s \
op monitor interval=15s timeout=10s role="Master" \
op monitor interval=16s timeout=10s role="Slave" \
op notify timeout=60s
# pgsql-ha
pcs -f cluster1.xml resource master pgsql-ha pgsqld notify=true
pcs -f cluster1.xml resource create pgsql-pri-ip ocf:heartbeat:IPaddr2 ip=10.10.10.243 nic=eth0 cidr_netmask=24 op monitor interval=10s
pcs --force -f cluster1.xml constraint colocation add pgsql-pri-ip with master pgsql-ha INFINITY;
pcs --force -f cluster1.xml constraint order promote pgsql-ha then start pgsql-pri-ip symmetrical=false kind=Mandatory;
pcs --force -f cluster1.xml constraint order demote pgsql-ha then stop pgsql-pri-ip symmetrical=false kind=Mandatory;
pcs cluster cib-push scope=configuration cluster1.xml;
pcs status
Log:
[root@node01 data]# pcs -f cluster1.xml resource create pgsqld ocf:heartbeat:pgsqlms \
> bindir=/usr/pgsql-13/bin \
> pgdata=/var/lib/pgsql/13/data \
> op start timeout=60s \
> op stop timeout=60s \
> op promote timeout=30s \
> op demote timeout=120s \
> op monitor interval=15s timeout=10s role="Master" \
> op monitor interval=16s timeout=10s role="Slave" \
> op notify timeout=60s
[root@node01 data]#
[root@node01 data]# pcs -f cluster1.xml resource master pgsql-ha pgsqld notify=true
[root@node01 data]#
[root@node01 data]# pcs -f cluster1.xml resource create pgsql-pri-ip ocf:heartbeat:IPaddr2 ip=10.10.10.243 nic=eth0 cidr_netmask=24 op monitor interval=10s
[root@node01 data]#
[root@node01 data]# pcs --force -f cluster1.xml constraint colocation add pgsql-pri-ip with master pgsql-ha INFINITY;
[root@node01 data]#
[root@node01 data]# pcs --force -f cluster1.xml constraint order promote pgsql-ha then start pgsql-pri-ip symmetrical=false kind=Mandatory;
Adding pgsql-ha pgsql-pri-ip (kind: Mandatory) (Options: first-action=promote then-action=start symmetrical=false)
[root@node01 data]#
[root@node01 data]# pcs --force -f cluster1.xml constraint order demote pgsql-ha then stop pgsql-pri-ip symmetrical=false kind=Mandatory;
Adding pgsql-ha pgsql-pri-ip (kind: Mandatory) (Options: first-action=demote then-action=stop symmetrical=false)
[root@node01 data]#
[root@node01 data]# pcs cluster cib-push scope=configuration cluster1.xml;
CIB updated
[root@node01 data]#
7. Check the cluster status:
[root@node01 data]# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
Last updated: Thu Sep 16 12:18:37 2021
Last change: Thu Sep 16 12:18:27 2021 by root via cibadmin on node01
2 nodes configured
5 resource instances configured (2 BLOCKED from further action due to failure)
Online: [ node01 node02 ]
Full list of resources:
fence_vm_server1 (stonith:fence_virsh): Started node02
fence_vm_server2 (stonith:fence_virsh): Started node01
Master/Slave Set: pgsql-ha [pgsqld]
pgsqld (ocf::heartbeat:pgsqlms): FAILED node02 (blocked)
pgsqld (ocf::heartbeat:pgsqlms): FAILED node01 (blocked)
pgsql-pri-ip (ocf::heartbeat:IPaddr2): Stopped
Failed Resource Actions:
* pgsqld_stop_0 on node02 'invalid parameter' (2): call=36, status=complete, exitreason='Parameter "primary_conninfo" MUST contain 'application_name=node02'. It is currently set to 'user=replicate password=admin chann',
last-rc-change='Thu Sep 16 12:18:32 2021', queued=0ms, exec=357ms
* pgsqld_stop_0 on node01 'invalid parameter' (2): call=36, status=complete, exitreason='Parameter "primary_conninfo" MUST contain 'application_name=node01'. It is currently set to 'host=10.10.10.243 application_name=',
last-rc-change='Thu Sep 16 12:18:32 2021', queued=0ms, exec=403ms
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node01 data]#
8. Still failing. I will troubleshoot this in a follow-up post, since I am currently focused on my new job.
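The exit reasons above already point at the likely fix: primary_conninfo on each node must include application_name=<that node's name> (see the note in step 3). After correcting the parameter on both nodes and restarting PostgreSQL, the failed/blocked actions should be clearable with a cleanup, roughly:
# pcs resource cleanup pgsqld
That is only the direction the troubleshooting would take; it has not been verified in this post.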
B. Delete fence & pcs cluster
1. Check pcs status showing the error
[root@node01 ~]# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
Last updated: Thu Sep 16 11:52:33 2021
Last change: Mon Sep 13 16:38:20 2021 by root via cibadmin on node01
2 nodes configured
6 resource instances configured
Online: [ node01 node02 ]
Full list of resources:
fence_vm_server1 (stonith:fence_virsh): Started node01
fence_vm_server2 (stonith:fence_virsh): Started node02
Master/Slave Set: pgsql-ha [pgsqld]
pgsqld (ocf::heartbeat:pgsqlms): FAILED node02
pgsqld (ocf::heartbeat:pgsqlms): FAILED node01
pgsql-pri-ip (ocf::heartbeat:IPaddr2): Stopped
ClusterIP (ocf::heartbeat:IPaddr2): Stopped
Failed Resource Actions:
* pgsqld_monitor_0 on node02 'invalid parameter' (2): call=16, status=complete, exitreason='Parameter "primary_conninfo" MUST contain 'application_name=node02'. It is currently set to 'user=replicate password=admin chann',
last-rc-change='Thu Sep 16 11:52:22 2021', queued=2ms, exec=7249ms
* pgsqld_monitor_0 on node01 'invalid parameter' (2): call=16, status=complete, exitreason='Parameter "primary_conninfo" MUST contain 'application_name=node01'. It is currently set to 'host=10.10.10.243 application_name=',
last-rc-change='Thu Sep 16 11:52:12 2021', queued=1ms, exec=5753ms
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node01 ~]#
2. Delete the resources
[root@node01 ~]# pcs resource delete ClusterIP
Attempting to stop: ClusterIP... Stopped
[root@node01 ~]#
[root@node01 ~]# pcs resource delete pgsql-pri-ip
Removing Constraint - colocation-pgsql-pri-ip-pgsql-ha-INFINITY
Removing Constraint - order-pgsql-ha-pgsql-pri-ip-Mandatory
Removing Constraint - order-pgsql-ha-pgsql-pri-ip-Mandatory-1
Deleting Resource - pgsql-pri-ip
[root@node01 ~]# pcs resource delete pgsql-ha
Deleting Resource - pgsqld
[root@node01 ~]# pcs resource delete pgsqld
Error: Resource 'pgsqld' does not exist.
[root@node01 ~]#
pcs cluster stop node01
3. Restart the cluster to apply the changes
[root@node01 ~]# pcs cluster stop --all
node02: Stopping Cluster (pacemaker)...
node01: Stopping Cluster (pacemaker)...
node02: Stopping Cluster (corosync)...
node01: Stopping Cluster (corosync)...
[root@node01 ~]#
[root@node01 ~]# pcs cluster start --all
node01: Starting Cluster (corosync)...
node02: Starting Cluster (corosync)...
node02: Starting Cluster (pacemaker)...
node01: Starting Cluster (pacemaker)...
[root@node01 ~]# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: NONE
Last updated: Thu Sep 16 12:06:00 2021
Last change: Thu Sep 16 12:03:57 2021 by root via cibadmin on node01
2 nodes configured
2 resource instances configured
OFFLINE: [ node01 node02 ]
Full list of resources:
fence_vm_server1 (stonith:fence_virsh): Stopped
fence_vm_server2 (stonith:fence_virsh): Stopped
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node01 ~]#
4. Delete the STONITH resources
[root@node01 ~]# pcs stonith show
fence_vm_server1 (stonith:fence_virsh): Started node01
fence_vm_server2 (stonith:fence_virsh): Started node02
[root@node01 ~]#
[root@node01 ~]# pcs stonith show fence_vm_server1
Resource: fence_vm_server1 (class=stonith type=fence_virsh)
Attributes: identity_file=/root/.ssh/id_rsa ipaddr=10.10.10.241 login=root passwd=root pcmk_host_check=static-list pcmk_host_list=node01 port=node01
Meta Attrs: provides=infecing
Operations: monitor interval=60s (fence_vm_server1-monitor-interval-60s)
[root@node01 ~]# pcs stonith show fence_vm_server2
Resource: fence_vm_server2 (class=stonith type=fence_virsh)
Attributes: identity_file=/root/.ssh/id_rsa ipaddr=10.10.10.242 login=root passwd=root pcmk_host_check=static-list pcmk_host_list=node02 port=node02
Meta Attrs: provides=infecing
Operations: monitor interval=60s (fence_vm_server2-monitor-interval-60s)
[root@node01 ~]#
[root@node01 ~]# pcs stonith delete fence_vm_server1
Attempting to stop: fence_vm_server1... Stopped
[root@node01 ~]# pcs stonith delete fence_vm_server2
Attempting to stop: fence_vm_server2... Stopped
[root@node01 ~]# pcs stonith show
NO stonith devices configured
[root@node01 ~]#
5. Check pcs status
[root@node01 ~]# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
Last updated: Thu Sep 16 12:27:00 2021
Last change: Thu Sep 16 12:25:13 2021 by root via cibadmin on node01
2 nodes configured
0 resource instances configured
Online: [ node01 node02 ]
No resources
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: inactive/enabled
[root@node01 ~]#
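If the goal is to remove not just the resources but the whole cluster definition (as this section's title suggests), pcs can tear it down entirely. Be aware this wipes the corosync/pacemaker configuration on both nodes:
# pcs cluster destroy --all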