Reputation: 6408
I'm working with a 10-node Infinispan cluster used as a Hibernate Search backend. Our servers run tc Server 2.5 (Tomcat 6.0.32) on Java 1.6.0_24. We are using JGroups 2.12.1.3 to handle cluster cache writes from each node and for multicast UDP transport.
When we launch three or more nodes in the cluster, one of the nodes eventually begins to log replication timeouts. We've observed the same result whether we configure Infinispan for replication or for distribution cache mode. Although the rest of the cluster remains stable, the failing node becomes essentially unusable for search.
Our configuration:
Infinispan:
<?xml version="1.0" encoding="UTF-8"?>
<infinispan
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:infinispan:config:5.0 http://www.infinispan.org/schemas/infinispan-config-5.0.xsd"
xmlns="urn:infinispan:config:5.0">
<global>
<globalJmxStatistics
enabled="true"
cacheManagerName="HibernateSearch"
allowDuplicateDomains="true" />
<transport
clusterName="HibernateSearch-Infinispan-cluster-MT"
distributedSyncTimeout="50000">
<properties>
<property name="configurationFile" value="infinispan-udp.cfg.xml" />
</properties>
</transport>
<shutdown
hookBehavior="DONT_REGISTER" />
</global>
<default>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<storeAsBinary storeKeysAsBinary="false" storeValuesAsBinary="true"
enabled="false" />
<invocationBatching
enabled="true" />
<clustering
mode="replication">
<stateRetrieval
timeout="60000"
logFlushTimeout="65000"
fetchInMemoryState="true"
alwaysProvideInMemoryState="true" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<jmxStatistics
enabled="true" />
<eviction
maxEntries="-1"
strategy="NONE" />
<expiration
maxIdle="-1" />
</default>
<namedCache
name="LuceneIndexesMetadata">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/metadata" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesData">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/data" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesLocking">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
</namedCache>
</infinispan>
JGroups (UDP):
<config xmlns="urn:org:jgroups"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">
<UDP
mcast_addr="${jgroups.udp.mcast_addr:228.10.10.9}"
mcast_port="${jgroups.udp.mcast_port:45599}"
tos="8"
ucast_recv_buf_size="20000000"
ucast_send_buf_size="640000"
mcast_recv_buf_size="25000000"
mcast_send_buf_size="640000"
loopback="true"
discard_incompatible_packets="true"
max_bundle_size="64000"
max_bundle_timeout="30"
ip_ttl="${jgroups.udp.ip_ttl:2}"
enable_bundling="true"
enable_diagnostics="false"
thread_naming_pattern="pl"
thread_pool.enabled="true"
thread_pool.min_threads="2"
thread_pool.max_threads="30"
thread_pool.keep_alive_time="5000"
thread_pool.queue_enabled="false"
thread_pool.queue_max_size="100"
thread_pool.rejection_policy="Discard"
oob_thread_pool.enabled="true"
oob_thread_pool.min_threads="2"
oob_thread_pool.max_threads="30"
oob_thread_pool.keep_alive_time="5000"
oob_thread_pool.queue_enabled="false"
oob_thread_pool.queue_max_size="100"
oob_thread_pool.rejection_policy="Discard"
/>
And the errors we observe:
10-31-2011 13:53:02 ERROR Hibernate Search: Directory writer-3 interceptors.InvocationContextInterceptor: ISPN000136: Execution error
org.infinispan.util.concurrent.TimeoutException: Replication timeout for tc-cluster-0105-21082
at org.infinispan.remoting.transport.AbstractTransport.parseResponseAndAddToResponseList(AbstractTransport.java:71)
at org.infinispan.remoting.transport.jgroups.JGroupsTransport.invokeRemotely(JGroupsTransport.java:452)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:132)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:156)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:265)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:252)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:235)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:228)
at org.infinispan.interceptors.ReplicationInterceptor.handleCrudMethod(ReplicationInterceptor.java:116)
at org.infinispan.interceptors.ReplicationInterceptor.visitPutKeyValueCommand(ReplicationInterceptor.java:79)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.LockingInterceptor.visitPutKeyValueCommand(LockingInterceptor.java:294)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.base.CommandInterceptor.handleDefault(CommandInterceptor.java:133)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.TxInterceptor.enlistWriteAndInvokeNext(TxInterceptor.java:214)
at org.infinispan.interceptors.TxInterceptor.visitPutKeyValueCommand(TxInterceptor.java:162)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.CacheMgmtInterceptor.visitPutKeyValueCommand(CacheMgmtInterceptor.java:114)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.InvocationContextInterceptor.handleAll(InvocationContextInterceptor.java:104)
at org.infinispan.interceptors.InvocationContextInterceptor.handleDefault(InvocationContextInterceptor.java:64)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.BatchingInterceptor.handleDefault(BatchingInterceptor.java:77)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.InterceptorChain.invoke(InterceptorChain.java:274)
at org.infinispan.CacheImpl.putIfAbsent(CacheImpl.java:524)
at org.infinispan.CacheSupport.putIfAbsent(CacheSupport.java:74)
at org.infinispan.lucene.locking.BaseLuceneLock.obtain(BaseLuceneLock.java:65)
at org.apache.lucene.store.Lock.obtain(Lock.java:72)
at org.apache.lucene.index.IndexWriter.<init>(IndexWriter.java:1097)
at org.hibernate.search.backend.Workspace.createNewIndexWriter(Workspace.java:202)
at org.hibernate.search.backend.Workspace.getIndexWriter(Workspace.java:180)
at org.hibernate.search.backend.impl.lucene.PerDPQueueProcessor.run(PerDPQueueProcessor.java:103)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:662)
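For context, the trace shows the timeout surfacing while Hibernate Search tries to obtain the Lucene index lock, which the Infinispan directory implements as a putIfAbsent on the LuceneIndexesLocking cache. Here is a minimal sketch of that call in isolation (the configuration file name and the key are placeholders, not our real values):

import org.infinispan.Cache;
import org.infinispan.manager.DefaultCacheManager;

public class LockProbe {
    public static void main(String[] args) throws Exception {
        // Boot a cache manager from the same configuration we deploy with
        // (file name is a placeholder).
        DefaultCacheManager manager = new DefaultCacheManager("infinispan.xml");
        Cache<Object, Object> lockCache = manager.getCache("LuceneIndexesLocking");

        // BaseLuceneLock.obtain() in the stack trace boils down to a putIfAbsent
        // on this cache; with <sync replTimeout="50000"/> the call waits for the
        // other members' acknowledgements and throws TimeoutException after 50s.
        Object previous = lockCache.putIfAbsent("probe-lock-key", "locked");
        System.out.println(previous == null ? "lock acquired" : "lock already held");

        manager.stop();
    }
}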
Because this error is so pervasive regardless of our topology or caching mode, we believe we must be misconfigured somewhere. Can anyone recommend a fix?
Upvotes: 3
Views: 4382
Reputation: 6408
It turns out we had a version clash between Infinispan and Hibernate Search. If you use Hibernate Search 3.4.1, you must use Infinispan 4.2.1; later Infinispan versions may not work.
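If you are not sure which versions actually end up on the classpath, a quick runtime check of the jar manifests can confirm a mismatch. This is just an illustrative sketch using plain JDK APIs against classes that exist in both libraries:

public class VersionCheck {
    public static void main(String[] args) {
        // Implementation-Version is read from each jar's MANIFEST.MF and may be
        // null if the jar was built without that attribute.
        String infinispan = org.infinispan.Cache.class
                .getPackage().getImplementationVersion();
        String hibernateSearch = org.hibernate.search.FullTextSession.class
                .getPackage().getImplementationVersion();

        System.out.println("Infinispan on classpath:       " + infinispan);
        System.out.println("Hibernate Search on classpath: " + hibernateSearch);
    }
}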
Upvotes: 2