[gpfsug-discuss] CCR cluster down for the count?

Edward Wahl ewahl at osc.edu
Wed Sep 20 20:07:39 BST 2017


So who was the ccrmaster before? 
What is/was the quorum config?  (tiebreaker disks?) 

What does 'mmccr check' say?


Have you set DEBUG=1 and tried mmstartup to see if it teases out any more info
from the error?
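
For example (a minimal sketch; DEBUG=1 just makes the mm* admin scripts trace the
commands they run, and the tee is only there to capture the output for later):

  # run on one of the quorum nodes
  DEBUG=1 /usr/lpp/mmfs/bin/mmstartup -a 2>&1 | tee /tmp/mmstartup.debug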


Ed


On Wed, 20 Sep 2017 16:27:48 +0000
"Buterbaugh, Kevin L" <Kevin.Buterbaugh at Vanderbilt.Edu> wrote:

> Hi Ed,
> 
> Thanks for the suggestion … that’s basically what I had done yesterday after
> Googling and getting a hit or two on the IBM DeveloperWorks site.  I’m
> including some output below which seems to show that I’ve got everything set
> up but it’s still not working.
> 
> Am I missing something?  We don’t use CCR on our production cluster (and this
> experience doesn’t make me eager to do so!), so I’m not that familiar with
> it...
> 
> Kevin
> 
> /var/mmfs/gen
> root at testnsd2# mmdsh -F /tmp/cluster.hostnames "ps -ef | grep mmccr | grep -v grep" | sort
> testdellnode1:  root      2583     1  0 May30 ?        00:10:33 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testdellnode1:  root      6694  2583  0 11:19 ?        00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testgateway:  root      2023  5828  0 11:19 ?          00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testgateway:  root      5828     1  0 Sep18 ?          00:00:19 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd1:  root     19356  4628  0 11:19 tty1          00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd1:  root      4628     1  0 Sep19 tty1          00:00:04 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd2:  root     22149  2983  0 11:16 ?             00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd2:  root      2983     1  0 Sep18 ?             00:00:27 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd3:  root     15685  6557  0 11:19 ?             00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testnsd3:  root      6557     1  0 Sep19 ?             00:00:04 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testsched:  root     29424  6512  0 11:19 ?            00:00:00 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> testsched:  root      6512     1  0 Sep18 ?            00:00:20 /usr/lpp/mmfs/bin/mmksh /usr/lpp/mmfs/bin/mmccrmonitor 15
> /var/mmfs/gen
> root at testnsd2# mmstartup -a
> get file failed: Not enough CCR quorum nodes available (err 809)
> gpfsClusterInit: Unexpected error from ccr fget mmsdrfs.  Return code: 158
> mmstartup: Command failed. Examine previous error messages to determine cause.
> /var/mmfs/gen
> root at testnsd2# mmdsh -F /tmp/cluster.hostnames "ls -l /var/mmfs/ccr" | sort
> testdellnode1:  drwxr-xr-x 2 root root 4096 Mar  3  2017 cached
> testdellnode1:  drwxr-xr-x 2 root root 4096 Nov 10  2016 committed
> testdellnode1:  -rw-r--r-- 1 root root   99 Nov 10  2016 ccr.nodes
> testdellnode1:  total 12
> testgateway:  drwxr-xr-x. 2 root root 4096 Jun 29  2016 committed
> testgateway:  drwxr-xr-x. 2 root root 4096 Mar  3  2017 cached
> testgateway:  -rw-r--r--. 1 root root   99 Jun 29  2016 ccr.nodes
> testgateway:  total 12
> testnsd1:  drwxr-xr-x 2 root root  6 Sep 19 15:38 cached
> testnsd1:  drwxr-xr-x 2 root root  6 Sep 19 15:38 committed
> testnsd1:  -rw-r--r-- 1 root root  0 Sep 19 15:39 ccr.disks
> testnsd1:  -rw-r--r-- 1 root root  4 Sep 19 15:38 ccr.noauth
> testnsd1:  -rw-r--r-- 1 root root 99 Sep 19 15:39 ccr.nodes
> testnsd1:  total 8
> testnsd2:  drwxr-xr-x 2 root root   22 Mar  3  2017 cached
> testnsd2:  drwxr-xr-x 2 root root 4096 Sep 18 11:49 committed
> testnsd2:  -rw------- 1 root root 4096 Sep 18 11:50 ccr.paxos.1
> testnsd2:  -rw------- 1 root root 4096 Sep 18 11:50 ccr.paxos.2
> testnsd2:  -rw-r--r-- 1 root root    0 Jun 29  2016 ccr.disks
> testnsd2:  -rw-r--r-- 1 root root   99 Jun 29  2016 ccr.nodes
> testnsd2:  total 16
> testnsd3:  drwxr-xr-x 2 root root  6 Sep 19 15:41 cached
> testnsd3:  drwxr-xr-x 2 root root  6 Sep 19 15:41 committed
> testnsd3:  -rw-r--r-- 1 root root  0 Jun 29  2016 ccr.disks
> testnsd3:  -rw-r--r-- 1 root root  4 Sep 19 15:41 ccr.noauth
> testnsd3:  -rw-r--r-- 1 root root 99 Jun 29  2016 ccr.nodes
> testnsd3:  total 8
> testsched:  drwxr-xr-x. 2 root root 4096 Jun 29  2016 committed
> testsched:  drwxr-xr-x. 2 root root 4096 Mar  3  2017 cached
> testsched:  -rw-r--r--. 1 root root   99 Jun 29  2016 ccr.nodes
> testsched:  total 12
> /var/mmfs/gen
> root at testnsd2# more ../ccr/ccr.nodes
> 3,0,10.0.6.215,,testnsd3.vampire
> 1,0,10.0.6.213,,testnsd1.vampire
> 2,0,10.0.6.214,,testnsd2.vampire
> /var/mmfs/gen
> root at testnsd2# mmdsh -F /tmp/cluster.hostnames "ls -l /var/mmfs/gen/mmsdrfs"
> testnsd1:  -rw-r--r-- 1 root root 20360 Sep 19 15:21 /var/mmfs/gen/mmsdrfs
> testnsd3:  -rw-r--r-- 1 root root 20360 Sep 19 15:34 /var/mmfs/gen/mmsdrfs
> testnsd2:  -rw-r--r-- 1 root root 20360 Aug 25 17:34 /var/mmfs/gen/mmsdrfs
> testdellnode1:  -rw-r--r-- 1 root root 20360 Aug 25 17:43 /var/mmfs/gen/mmsdrfs
> testgateway:  -rw-r--r--. 1 root root 20360 Aug 25 17:43 /var/mmfs/gen/mmsdrfs
> testsched:  -rw-r--r--. 1 root root 20360 Aug 25 17:43 /var/mmfs/gen/mmsdrfs
> /var/mmfs/gen
> root at testnsd2# mmdsh -F /tmp/cluster.hostnames "md5sum /var/mmfs/gen/mmsdrfs"
> testnsd1:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> testnsd3:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> testnsd2:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> testdellnode1:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> testgateway:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> testsched:  7120c79d9d767466c7629763abb7f730  /var/mmfs/gen/mmsdrfs
> /var/mmfs/gen
> root at testnsd2# mmdsh -F /tmp/cluster.hostnames "md5sum /var/mmfs/ssl/stage/genkeyData1"
> testnsd3:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> testnsd1:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> testnsd2:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> testdellnode1:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> testgateway:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> testsched:  ee6d345a87202a9f9d613e4862c92811  /var/mmfs/ssl/stage/genkeyData1
> /var/mmfs/gen
> root at testnsd2#
> 
> On Sep 20, 2017, at 10:48 AM, Edward Wahl <ewahl at osc.edu> wrote:
> 
> I've run into this before.  We didn't use to use CCR.  And restoring nodes for
> us is a major pain in the rear as we only allow one-way root SSH, so we have a
> number of useful little scripts to work around problems like this.
> 
> Assuming that you have all the necessary files copied to the correct
> places, you can manually kick off CCR.
> 
> I think my script does something like:
> 
> (copy the encryption key info)
> 
> scp  /var/mmfs/ccr/ccr.nodes <node>:/var/mmfs/ccr/
> 
> scp /var/mmfs/gen/mmsdrfs <node>:/var/mmfs/gen/
> 
> scp /var/mmfs/ssl/stage/genkeyData1  <node>:/var/mmfs/ssl/stage/
> 
> on <node>: /usr/lpp/mmfs/bin/mmcommon startCcrMonitor
> 
> You should then see something like two copies of it running under mmksh.
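> 
> As a rough sketch, those steps could be wrapped into a small per-node helper
> like the one below (the script name is made up, and using ssh for the last
> step assumes your one-way root SSH points toward the node being restored):
> 
>   #!/bin/bash
>   # restore-ccr-node.sh <node> -- hypothetical helper, not an official GPFS tool
>   node="$1"
>   # copy the CCR node list, the cluster configuration file, and the staged key data
>   scp /var/mmfs/ccr/ccr.nodes          "${node}:/var/mmfs/ccr/"
>   scp /var/mmfs/gen/mmsdrfs            "${node}:/var/mmfs/gen/"
>   scp /var/mmfs/ssl/stage/genkeyData1  "${node}:/var/mmfs/ssl/stage/"
>   # then start the CCR monitor on that node
>   ssh "$node" /usr/lpp/mmfs/bin/mmcommon startCcrMonitor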
> 
> Ed
> 
> 
> On Wed, 20 Sep 2017 13:55:28 +0000
> "Buterbaugh, Kevin L" <Kevin.Buterbaugh at Vanderbilt.Edu> wrote:
> 
> Hi All,
> 
> testnsd1 and testnsd3 both had hardware issues (power supply and internal HD,
> respectively).  Given that they were 12-year-old boxes, we decided to replace
> them with other boxes that are a mere 7 years old … keep in mind that this is
> a test cluster.
> 
> Disabling CCR does not work, even with the undocumented “--force” option:
> 
> /var/mmfs/gen
> root at testnsd2# mmchcluster --ccr-disable -p testnsd2 -s testnsd1 --force
> mmchcluster: Unable to obtain the GPFS configuration file lock.
> mmchcluster: GPFS was unable to obtain a lock from node testnsd1.vampire.
> mmchcluster: Processing continues without lock protection.
> The authenticity of host 'testnsd3.vampire (10.0.6.215)' can't be established.
> ECDSA key fingerprint is SHA256:Ky1pkjsC/kvt4RA8PJuEh/W3vcxCJZplr2m1XHr+UwI.
> ECDSA key fingerprint is MD5:55:59:a0:2a:6e:a1:00:58:85:3d:ac:86:0e:cd:2a:8a.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'testnsd1.vampire (10.0.6.213)' can't be established.
> ECDSA key fingerprint is SHA256:WPiTtyuyzhuv+lRRpgDjLuHpyHyk/W3+c5N9SabWvnE.
> ECDSA key fingerprint is MD5:26:26:2a:bf:e4:cb:1d:a8:27:35:96:ef:b5:96:e0:29.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp609.vampire (10.0.21.9)' can't be established.
> ECDSA key fingerprint is SHA256:/gX6eSp/shsRboVFcUFcNCtGSfbBIWQZ/CWjA6gb17Q.
> ECDSA key fingerprint is MD5:ca:4d:58:8c:91:28:25:7b:5b:b1:0d:a3:72:a3:00:bb.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp608.vampire (10.0.21.8)' can't be established.
> ECDSA key fingerprint is SHA256:tvtNWN9b7/Qknb/Am8x7FzyMngi6R3f5SHBqATNtLzw.
> ECDSA key fingerprint is MD5:fc:4e:87:fb:09:82:cd:67:b0:7d:7f:c7:4b:83:b9:6c.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp612.vampire (10.0.21.12)' can't be established.
> ECDSA key fingerprint is SHA256:zKXqPt8rIMZWSAYavKEuaAVIm31OGVovoWVU+dBTRPM.
> ECDSA key fingerprint is MD5:72:4d:fb:22:4e:b3:0e:04:37:be:16:74:ae:ea:05:6c.
> Are you sure you want to continue connecting (yes/no)?
> root at vmp610.vampire's password:
> testnsd3.vampire:  Host key verification failed.
> mmdsh: testnsd3.vampire remote shell process had return code 255.
> testnsd1.vampire:  Host key verification failed.
> mmdsh: testnsd1.vampire remote shell process had return code 255.
> vmp609.vampire:  Host key verification failed.
> mmdsh: vmp609.vampire remote shell process had return code 255.
> vmp608.vampire:  Host key verification failed.
> mmdsh: vmp608.vampire remote shell process had return code 255.
> vmp612.vampire:  Host key verification failed.
> mmdsh: vmp612.vampire remote shell process had return code 255.
> 
> root at vmp610.vampire's password: vmp610.vampire: Permission denied, please try again.
> 
> root at vmp610.vampire's password: vmp610.vampire: Permission denied, please try again.
> 
> vmp610.vampire:  Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
> mmdsh: vmp610.vampire remote shell process had return code 255.
> 
> Verifying GPFS is stopped on all nodes ...
> The authenticity of host 'testnsd3.vampire (10.0.6.215)' can't be established.
> ECDSA key fingerprint is SHA256:Ky1pkjsC/kvt4RA8PJuEh/W3vcxCJZplr2m1XHr+UwI.
> ECDSA key fingerprint is MD5:55:59:a0:2a:6e:a1:00:58:85:3d:ac:86:0e:cd:2a:8a.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp612.vampire (10.0.21.12)' can't be established.
> ECDSA key fingerprint is SHA256:zKXqPt8rIMZWSAYavKEuaAVIm31OGVovoWVU+dBTRPM.
> ECDSA key fingerprint is MD5:72:4d:fb:22:4e:b3:0e:04:37:be:16:74:ae:ea:05:6c.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp608.vampire (10.0.21.8)' can't be established.
> ECDSA key fingerprint is SHA256:tvtNWN9b7/Qknb/Am8x7FzyMngi6R3f5SHBqATNtLzw.
> ECDSA key fingerprint is MD5:fc:4e:87:fb:09:82:cd:67:b0:7d:7f:c7:4b:83:b9:6c.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'vmp609.vampire (10.0.21.9)' can't be established.
> ECDSA key fingerprint is SHA256:/gX6eSp/shsRboVFcUFcNCtGSfbBIWQZ/CWjA6gb17Q.
> ECDSA key fingerprint is MD5:ca:4d:58:8c:91:28:25:7b:5b:b1:0d:a3:72:a3:00:bb.
> Are you sure you want to continue connecting (yes/no)?
> The authenticity of host 'testnsd1.vampire (10.0.6.213)' can't be established.
> ECDSA key fingerprint is SHA256:WPiTtyuyzhuv+lRRpgDjLuHpyHyk/W3+c5N9SabWvnE.
> ECDSA key fingerprint is MD5:26:26:2a:bf:e4:cb:1d:a8:27:35:96:ef:b5:96:e0:29.
> Are you sure you want to continue connecting (yes/no)?
> root at vmp610.vampire's password:
> root at vmp610.vampire's password:
> root at vmp610.vampire's password:
> 
> testnsd3.vampire:  Host key verification failed.
> mmdsh: testnsd3.vampire remote shell process had return code 255.
> vmp612.vampire:  Host key verification failed.
> mmdsh: vmp612.vampire remote shell process had return code 255.
> vmp608.vampire:  Host key verification failed.
> mmdsh: vmp608.vampire remote shell process had return code 255.
> vmp609.vampire:  Host key verification failed.
> mmdsh: vmp609.vampire remote shell process had return code 255.
> testnsd1.vampire:  Host key verification failed.
> mmdsh: testnsd1.vampire remote shell process had return code 255.
> vmp610.vampire:  Permission denied, please try again.
> vmp610.vampire:  Permission denied, please try again.
> vmp610.vampire:  Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
> mmdsh: vmp610.vampire remote shell process had return code 255.
> mmchcluster: Command failed. Examine previous error messages to determine cause.
> /var/mmfs/gen
> root at testnsd2#
> 
> I believe that part of the problem may be that there are 4 client nodes that
> were repurposed (by another SysAdmin who was in a hurry) without first being
> removed from the cluster configuration.  They’re up and pingable but not
> reachable by GPFS anymore, which I’m pretty sure is making things worse.
> 
> Loic’s suggestion of running mmcommon didn’t fix things either (but thanks for
> the suggestion!) … the mmcommon part itself worked, but a subsequent attempt
> to start the cluster up failed:
> 
> /var/mmfs/gen
> root at testnsd2# mmstartup -a
> get file failed: Not enough CCR quorum nodes available (err 809)
> gpfsClusterInit: Unexpected error from ccr fget mmsdrfs.  Return code: 158
> mmstartup: Command failed. Examine previous error messages to determine cause.
> /var/mmfs/gen
> root at testnsd2#
> 
> Thanks.
> 
> Kevin
> 
> On Sep 19, 2017, at 10:07 PM, IBM Spectrum Scale <scale at us.ibm.com> wrote:
> 
> 
> Hi Kevin,
> 
> Let me try to understand the problem you have.  What do you mean by "node
> died" here?  Do you mean that there is a hardware/OS issue which cannot be
> fixed and the OS cannot come up anymore?
> 
> I agree with Bob that you can try disabling CCR temporarily, restoring the
> cluster configuration, and then enabling CCR again.
> 
> Such as:
> 
> 1. Log in to a node which has a proper GPFS config, e.g. NodeA
> 2. Shut down the daemon on all nodes in the cluster.
> 3. mmchcluster --ccr-disable -p NodeA
> 4. mmsdrrestore -a -p NodeA
> 5. mmauth genkey propagate -N testnsd1, testnsd3
> 6. mmchcluster --ccr-enable
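> 
> In command form, that sequence would look roughly like the following sketch
> (using mmshutdown -a for step 2 is an assumption, and the node names should be
> adapted to the cluster at hand):
> 
>   # run from NodeA, a node whose GPFS configuration is known to be good
>   mmshutdown -a                                   # 2. stop the daemon cluster-wide
>   mmchcluster --ccr-disable -p NodeA              # 3. fall back to server-based config repository
>   mmsdrrestore -a -p NodeA                        # 4. push the configuration back out to all nodes
>   mmauth genkey propagate -N testnsd1,testnsd3    # 5. re-propagate the key to the rebuilt nodes
>   mmchcluster --ccr-enable                        # 6. switch the cluster back to CCR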
> 
> Regards, The Spectrum Scale (GPFS) team
> 
> ------------------------------------------------------------------------------------------------------------------
> If you feel that your question can benefit other users of Spectrum Scale
> (GPFS), then please post it to the public IBM developerWorks Forum at
> https://www.ibm.com/developerworks/community/forums/html/forum?id=11111111-0000-0000-0000-000000000479.
> 
> If your query concerns a potential software error in Spectrum Scale (GPFS)
> and you have an IBM software maintenance contract please contact
> 1-800-237-5511 in the United States or your local IBM Service Center in other
> countries.
> 
> The forum is informally monitored as time permits and should not be used for
> priority messages to the Spectrum Scale (GPFS) team.
> 
> From: "Oesterlin, Robert" <Robert.Oesterlin at nuance.com>
> To: gpfsug main discussion list <gpfsug-discuss at spectrumscale.org>
> Date: 09/20/2017 07:39 AM
> Subject: Re: [gpfsug-discuss] CCR cluster down for the count?
> Sent by: gpfsug-discuss-bounces at spectrumscale.org
> 
> ________________________________
> 
> 
> 
> OK – I’ve run across this before, and it’s because of a bug (as I recall)
> having to do with CCR and quorum. What I think you can do is set the cluster
> to non-CCR (mmchcluster --ccr-disable) with all the nodes down, bring it back
> up, and then re-enable CCR.
> 
> I’ll see if I can find this in one of the recent 4.2 release notes.
> 
> 
> Bob Oesterlin
> Sr Principal Storage Engineer, Nuance
> 
> 
> From: <gpfsug-discuss-bounces at spectrumscale.org> on behalf of "Buterbaugh, Kevin L" <Kevin.Buterbaugh at Vanderbilt.Edu>
> Reply-To: gpfsug main discussion list <gpfsug-discuss at spectrumscale.org>
> Date: Tuesday, September 19, 2017 at 4:03 PM
> To: gpfsug main discussion list <gpfsug-discuss at spectrumscale.org>
> Subject: [EXTERNAL] [gpfsug-discuss] CCR cluster down for the count?
> 
> Hi All,
> 
> We have a small test cluster that is CCR enabled. It only had/has 3 NSD
> servers (testnsd1, 2, and 3) and maybe 3-6 clients. testnsd3 died a while
> back. I did nothing about it at the time because it was due to be life-cycled
> as soon as I finished a couple of higher priority projects.
> 
> Yesterday, testnsd1 also died, which took the whole cluster down. So now
> resolving this has become higher priority… ;-)
> 
> I took two other boxes and set them up as testnsd1 and 3, respectively. I’ve
> done a “mmsdrrestore -p testnsd2 -R /usr/bin/scp” on both of them. I’ve also
> done a “mmccr setup -F” and copied the ccr.disks and ccr.nodes files from
> testnsd2 to them. And I’ve copied /var/mmfs/gen/mmsdrfs from testnsd2 to
> testnsd1 and 3. In case it’s not obvious from the above, networking is fine …
> ssh without a password between those 3 boxes is fine.
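> 
> In other words, the recovery on each rebuilt node amounted to roughly this
> (the scp form of the file copies is an assumption; the files were simply
> copied over from testnsd2):
> 
>   mmsdrrestore -p testnsd2 -R /usr/bin/scp   # pull the configuration from the surviving quorum node
>   mmccr setup -F                             # re-initialize the local CCR state
>   scp testnsd2:/var/mmfs/ccr/ccr.disks /var/mmfs/ccr/
>   scp testnsd2:/var/mmfs/ccr/ccr.nodes /var/mmfs/ccr/
>   scp testnsd2:/var/mmfs/gen/mmsdrfs   /var/mmfs/gen/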
> 
> However, when I try to startup GPFS … or run any GPFS command I get:
> 
> /root
> root at testnsd2# mmstartup -a
> get file failed: Not enough CCR quorum nodes available (err 809)
> gpfsClusterInit: Unexpected error from ccr fget mmsdrfs. Return code: 158
> mmstartup: Command failed. Examine previous error messages to determine cause.
> /root
> root at testnsd2#
> 
> I’ve got to run to a meeting right now, so I hope I’m not leaving out any
> crucial details here … does anyone have an idea what I need to do? Thanks…
> 
> Kevin Buterbaugh - Senior System Administrator
> Vanderbilt University - Advanced Computing Center for Research and Education
> Kevin.Buterbaugh at vanderbilt.edu - (615)875-9633
> 
> 
> _______________________________________________
> gpfsug-discuss mailing list
> gpfsug-discuss at spectrumscale.org
> http://gpfsug.org/mailman/listinfo/gpfsug-discuss
> 
> 
> 
> _______________________________________________
> gpfsug-discuss mailing list
> gpfsug-discuss at spectrumscale.org
> http://gpfsug.org/mailman/listinfo/gpfsug-discuss
> 
> 
> 
> 
> --
> 
> Ed Wahl
> Ohio Supercomputer Center
> 614-292-9302
> 
> 
> 
> Kevin Buterbaugh - Senior System Administrator
> Vanderbilt University - Advanced Computing Center for Research and Education
> Kevin.Buterbaugh at vanderbilt.edu - (615)875-9633
> 
> 
> 



-- 

Ed Wahl
Ohio Supercomputer Center
614-292-9302


