Continuing with this thread. I ended up just deleting the database and
recreating it, and the problem went away. I'm not sure why.
Nevertheless, I am now using the following
default-distributed-db-config.json:
{
"replication": true,
"autoDeploy": true,
"hotAlignment": false,
"resyncEvery": 15,
"clusters": {
"internal": {
"replication": false
},
"index": {
"replication": false
},
"*": {
"replication": true,
"readQuorum": 1,
"writeQuorum": 1,
"failureAvailableNodesLessQuorum": false,
"readYourWrites": true,
"partitioning": {
"strategy": "round-robin",
"default": 0,
"partitions": [
[ "<NEW_NODE>" ]
]
}
}
}
}
However, I noticed that now the following warning appears in the logs on
each cluster node:
WARNING readQuorum setting not found for cluster=[class name]_[node name]
in distributed-config.json
Why would this warning appear? Is it something that will eventually
compromise data integrity? Does anyone have any ideas about this? Thanks.
Amir.
On Tuesday, March 24, 2015 at 1:13:31 PM UTC-5, Amir Khawaja wrote:
>
> Please find the contents of the distributed-config.json file below:
>
>
> {"@type":"d","@version":0,"version":58,"replication":true,"autoDeploy":true,"hotAlignment":false,"resyncEvery":15,"clusters":{"@type":"d","@version":0,"internal":{"@type":"d","@version":0,"replication":false},"index":{"@type":"d","@version":0,"replication":false},"*":{"@type":"d","@version":0,"replication":true,"readQuorum":1,"writeQuorum":1,"failureAvailableNodesLessQuorum":false,"readYourWrites":true,"servers":["odb01ue2","odb02ue2","odb01uw","odb02uw","<NEW_NODE>"]},"triggered_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"visitor_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"visitortrait_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"v_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"event_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"has_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"eventtrait_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"orole_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"license_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"_studio_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"customer_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"orids_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"oschedule_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"foreignidentifier_odb02ue2":{"@type":"d","@version":0,"se
rvers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"e_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"ouser_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"ofunction_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"belongsto_odb02ue2":{"@type":"d","@version":0,"servers":["odb02ue2","odb01ue2","odb01uw","odb02uw","<NEW_NODE>"]},"belongsto_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"_studio_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"orids_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"eventtrait_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"v_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"visitor_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"ouser_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"triggered_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"oschedule_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"e_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"ofunction_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"orole_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"visitortrait_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"license_odb01uw":{"@type":"d","@
version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"foreignidentifier_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"event_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"customer_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"has_odb01uw":{"@type":"d","@version":0,"servers":["odb01uw","odb01ue2","odb02ue2","odb02uw","<NEW_NODE>"]},"_studio_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"customer_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"orids_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"visitortrait_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"license_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"event_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"triggered_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"v_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"orole_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"foreignidentifier_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"has_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"eventtrait_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"visitor_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"ofunct
ion_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"belongsto_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"e_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"oschedule_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]},"ouser_odb02uw":{"@type":"d","@version":0,"servers":["odb02uw","odb01ue2","odb02ue2","odb01uw","<NEW_NODE>"]}}}
>
>
> Amir.
>
>
>
> On Tuesday, March 24, 2015 at 1:02:46 PM UTC-5, Colin wrote:
>>
>> For some reason it's trying to reach a quorum of 4.
>>
>> Could you paste your database's distributed-config.json file please?
>>
>> -Colin
>>
>> On Tuesday, March 24, 2015 at 12:40:15 PM UTC-5, Amir Khawaja wrote:
>>>
>>> The cluster is now online in US East2 and US West. I did the following:
>>>
>>> - Changed the default-distributed-db-config.json to:
>>>
>>> {
>>> "replication": true,
>>> "autoDeploy": true,
>>> "hotAlignment": false,
>>> "resyncEvery": 15,
>>> "clusters": {
>>> "internal": {
>>> "replication": false
>>> },
>>> "index": {
>>> "replication": false
>>> },
>>> "*": {
>>> "replication": true,
>>> "readQuorum": 1,
>>> "writeQuorum": 1,
>>> "failureAvailableNodesLessQuorum": false,
>>> "readYourWrites": true,
>>> "partitioning": {
>>> "strategy": "round-robin",
>>> "default": 0,
>>> "partitions": [
>>> [ "<NEW_NODE>" ]
>>> ]
>>> }
>>> }
>>> }
>>> }
>>>
>>> - Deleted the distributed-config.json file from each database folder and
>>> restarted each node in the cluster.
>>>
>>> Now, when I connect to one of the nodes and try to delete a vertex, I
>>> receive the following error:
>>>
>>> com.orientechnologies.orient.server.distributed.ODistributedException:
>>> Error on executing distributed request (id=141
>>> from=odb02uw task=command_sql(delete vertex #42:2) userName=) against
>>> database 'vis.[]' to nodes [odb02ue2, odb02uw,
>>> odb01uw, odb01ue2] -->
>>> com.orientechnologies.orient.server.distributed.ODistributedException:
>>> Quorum 4 not reached for
>>> request (id=141 from=odb02uw task=command_sql(delete vertex #42:2)
>>> userName=). Timeout=407ms Servers in timeout/
>>> conflict are: - odb02ue2:
>>> com.orientechnologies.orient.core.exception.OCommandExecutionException:
>>> Error on execution
>>> of command: sql.delete vertex #42:2 - odb01ue2:
>>> com.orientechnologies.orient.core.exception.
>>> OCommandExecutionException: Error on execution of command: sql.delete
>>> vertex #42:2 - odb01uw: com.orientechnologies.
>>> orient.core.exception.OCommandExecutionException: Error on execution of
>>> command: sql.delete vertex #42:2 Received:
>>> {odb02uw=com.orientechnologies.orient.core.exception.OCommandExecutionException:
>>>
>>> Error on execution of command: sql.
>>> delete vertex #42:2,
>>> odb01uw=com.orientechnologies.orient.core.exception.OCommandExecutionException:
>>>
>>> Error on
>>> execution of command: sql.delete vertex #42:2,
>>> odb02ue2=com.orientechnologies.orient.core.exception.
>>> OCommandExecutionException: Error on execution of command: sql.delete
>>> vertex #42:2, odb01ue2=com.orientechnologies.
>>> orient.core.exception.OCommandExecutionException: Error on execution of
>>> command: sql.delete vertex #42:2}
>>>
>>> Why am I not able to delete a vertex?
>>>
>>> Amir.
>>>
>>>
>>> On Tuesday, March 24, 2015 at 12:20:37 PM UTC-5, Colin wrote:
>>>>
>>>> That latency should be fine so long as it's consistent.
>>>>
>>>> -Colin
>>>>
>>>> On Tuesday, March 24, 2015 at 11:52:58 AM UTC-5, Amir Khawaja wrote:
>>>>>
>>>>> Hi Colin,
>>>>>
>>>>> I checked the latency prior to posting and between regions it is about
>>>>> 65ms on average. What should I set the latency to for Hazelcast?
>>>>>
>>>>> Amir.
>>>>>
>>>>> On Tuesday, March 24, 2015 at 11:49:25 AM UTC-5, Colin wrote:
>>>>>>
>>>>>> Hi Amir,
>>>>>>
>>>>>> You might also do a ping and a traceroute between the machines and
>>>>>> see what kind of latency you're getting, just in case it's a timeout
>>>>>> issue
>>>>>> with Hazelcast.
>>>>>>
>>>>>> -Colin
>>>>>>
>>>>>> On Tuesday, March 24, 2015 at 11:32:21 AM UTC-5, Amir Khawaja wrote:
>>>>>>>
>>>>>>> Hi Colin,
>>>>>>>
>>>>>>> Thank you for the prompt response.
>>>>>>>
>>>>>>> I'm a little confused as you say "the US West node will not come
>>>>>>>> online telling me that the database is not yet online. At that point,
>>>>>>>> I
>>>>>>>> kill the process and then eventually the database comes online."
>>>>>>>
>>>>>>> Do you mean you kill the database process and then restart it and
>>>>>>>> then it starts communicating?
>>>>>>>
>>>>>>>
>>>>>>> Yes. I kill the database process on the cluster node where
>>>>>>> OrientDB is not coming online.
>>>>>>>
>>>>>>> Can you see on each machine when Hazelcast 'sees' all the members?
>>>>>>>> Are all the members showing up?
>>>>>>>
>>>>>>>
>>>>>>> Yes. I see the databases are talking to each other as the IP address
>>>>>>> of the nodes show up in the log of each database server.
>>>>>>>
>>>>>>> I will try setting hotAlignment to false and report my results on
>>>>>>> this thread.
>>>>>>>
>>>>>>> Amir.
>>>>>>>
>>>>>>>
>>>>>>> On Tuesday, March 24, 2015 at 11:25:16 AM UTC-5, Colin wrote:
>>>>>>>>
>>>>>>>> Hi Amir,
>>>>>>>>
>>>>>>>> Is it consistently a problem between the same machines not seeing
>>>>>>>> each other?
>>>>>>>>
>>>>>>>> I'm a little confused as you say "the US West node will not come
>>>>>>>> online telling me that the database is not yet online. At that point,
>>>>>>>> I
>>>>>>>> kill the process and then eventually the database comes online."
>>>>>>>>
>>>>>>>> Do you mean you kill the database process and then restart it and
>>>>>>>> then it starts communicating?
>>>>>>>>
>>>>>>>> In your distributed json file, try setting "hotAlignment" to false.
>>>>>>>>
>>>>>>>> Can you see on each machine when Hazelcast 'sees' all the members?
>>>>>>>> Are all the members showing up?
>>>>>>>>
>>>>>>>> -Colin
>>>>>>>>
>>>>>>>> Orient Technologies
>>>>>>>>
>>>>>>>> The Company behind OrientDB
>>>>>>>>
>>>>>>>> On Tuesday, March 24, 2015 at 11:19:05 AM UTC-5, Amir Khawaja wrote:
>>>>>>>>>
>>>>>>>>> Greetings, everyone. Has anyone had much success running an
>>>>>>>>> OrientDB 2.0.5 cluster in Azure? I created a cluster in Windows Azure
>>>>>>>>> with
>>>>>>>>> 4 nodes using CentOS 7 and OrientDB Community 2.0.4 -- 2 nodes in US
>>>>>>>>> East2
>>>>>>>>> and 2 nodes in US West. There is a Site-to-Site VPN connection
>>>>>>>>> between the
>>>>>>>>> two regions in Azure and data is flowing between machines across the
>>>>>>>>> network. I have three databases that I have currently deployed and
>>>>>>>>> testing.
>>>>>>>>> I find that many times the synchronization between databases does not
>>>>>>>>> occur. For instance, if I startup the first node in US East2 and once
>>>>>>>>> that
>>>>>>>>> comes online, fire up the second node in US West, the US West node
>>>>>>>>> will not
>>>>>>>>> come online telling me that the database is not yet online. At that
>>>>>>>>> point,
>>>>>>>>> I kill the process and then eventually the database comes online. I
>>>>>>>>> even
>>>>>>>>> have to go so far as to delete the databases in the database path
>>>>>>>>> folder. I
>>>>>>>>> do this a few times and eventually the server may startup. Sometimes,
>>>>>>>>> I
>>>>>>>>> will have three of the four nodes working and the fourth just refuses
>>>>>>>>> to
>>>>>>>>> come online.
>>>>>>>>>
>>>>>>>>> The VM size selected for each node in the cluster is a D4 (4
>>>>>>>>> cores, 28GB RAM). This should be more than sufficient to handle most
>>>>>>>>> loads.
>>>>>>>>> Surely, I must be missing something as this is not acceptable
>>>>>>>>> production
>>>>>>>>> behavior. For reference, I am pasting the hazelcast.xml and
>>>>>>>>> default-distributed-db-config.json files here in hopes that someone
>>>>>>>>> has
>>>>>>>>> some pointers for me.
>>>>>>>>>
>>>>>>>>> *** hazelcast.xml ***
>>>>>>>>>
>>>>>>>>> <?xml version="1.0" encoding="UTF-8"?>
>>>>>>>>> <!-- ~ Copyright (c) 2008-2012, Hazel Bilisim Ltd. All Rights
>>>>>>>>> Reserved. ~
>>>>>>>>> ~ Licensed under the Apache License, Version 2.0 (the "License");
>>>>>>>>> ~ you may
>>>>>>>>> not use this file except in compliance with the License. ~ You may
>>>>>>>>> obtain
>>>>>>>>> a copy of the License at ~ ~
>>>>>>>>> http://www.apache.org/licenses/LICENSE-2.0 ~
>>>>>>>>> ~ Unless required by applicable law or agreed to in writing,
>>>>>>>>> software ~ distributed
>>>>>>>>> under the License is distributed on an "AS IS" BASIS, ~ WITHOUT
>>>>>>>>> WARRANTIES
>>>>>>>>> OR CONDITIONS OF ANY KIND, either express or implied. ~ See the
>>>>>>>>> License for
>>>>>>>>> the specific language governing permissions and ~ limitations
>>>>>>>>> under the License. -->
>>>>>>>>>
>>>>>>>>> <hazelcast
>>>>>>>>> xsi:schemaLocation="http://www.hazelcast.com/schema/config
>>>>>>>>> hazelcast-config-3.0.xsd"
>>>>>>>>> xmlns="http://www.hazelcast.com/schema/config" xmlns:xsi="
>>>>>>>>> http://www.w3.org/2001/XMLSchema-instance">
>>>>>>>>> <group>
>>>>>>>>> <name>[name]</name>
>>>>>>>>> <password>[password]</password>
>>>>>>>>> </group>
>>>>>>>>> <network>
>>>>>>>>> <port auto-increment="true">2434</port>
>>>>>>>>> <join>
>>>>>>>>> <multicast enabled="false">
>>>>>>>>> <multicast-group>235.1.1.1</multicast-group>
>>>>>>>>> <multicast-port>2434</multicast-port>
>>>>>>>>> </multicast>
>>>>>>>>> <tcp-ip enabled="true">
>>>>>>>>> <member>10.0.0.4</member>
>>>>>>>>> <member>10.0.0.5</member>
>>>>>>>>> <member>10.1.0.4</member>
>>>>>>>>> <member>10.1.0.5</member>
>>>>>>>>> </tcp-ip>
>>>>>>>>> </join>
>>>>>>>>> </network>
>>>>>>>>> <executor-service>
>>>>>>>>> <pool-size>16</pool-size>
>>>>>>>>> </executor-service>
>>>>>>>>> </hazelcast>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> *** default-distributed-db-config.json ***
>>>>>>>>>
>>>>>>>>> {
>>>>>>>>> "autoDeploy": true,
>>>>>>>>> "hotAlignment": true,
>>>>>>>>> "executionMode": "synchronous",
>>>>>>>>> "readQuorum": 1,
>>>>>>>>> "writeQuorum": 3,
>>>>>>>>> "failureAvailableNodesLessQuorum": false,
>>>>>>>>> "readYourWrites": true,
>>>>>>>>> "clusters": {
>>>>>>>>> "internal": {
>>>>>>>>> },
>>>>>>>>> "index": {
>>>>>>>>> },
>>>>>>>>> "*": {
>>>>>>>>> "servers" : [ "<NEW_NODE>" ]
>>>>>>>>> }
>>>>>>>>> }
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> Thank you for any assistance you can offer.
>>>>>>>>>
>>>>>>>>> Amir.
>>>>>>>>>
>>>>>>>>
--
---
You received this message because you are subscribed to the Google Groups
"OrientDB" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.