001/*
002 * The contents of this file are subject to the terms of the Common Development and
003 * Distribution License (the License). You may not use this file except in compliance with the
004 * License.
005 *
006 * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the
007 * specific language governing permission and limitations under the License.
008 *
009 * When distributing Covered Software, include this CDDL Header Notice in each file and include
010 * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL
011 * Header, with the fields enclosed by brackets [] replaced by your own identifying
012 * information: "Portions Copyright [year] [name of copyright owner]".
013 *
014 * Copyright 2006-2010 Sun Microsystems, Inc.
015 * Portions Copyright 2011-2016 ForgeRock AS.
016 */
017package org.opends.server.replication.server;
018
019import java.io.IOException;
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.List;
025import java.util.Map;
026import java.util.Map.Entry;
027import java.util.Timer;
028import java.util.TimerTask;
029import java.util.concurrent.ConcurrentHashMap;
030import java.util.concurrent.TimeUnit;
031import java.util.concurrent.atomic.AtomicReference;
032import java.util.concurrent.locks.ReentrantLock;
033
034import net.jcip.annotations.GuardedBy;
035
036import org.forgerock.i18n.LocalizableMessage;
037import org.forgerock.i18n.LocalizableMessageBuilder;
038import org.forgerock.i18n.slf4j.LocalizedLogger;
039import org.forgerock.opendj.ldap.ResultCode;
040import org.opends.server.admin.std.server.MonitorProviderCfg;
041import org.opends.server.api.MonitorData;
042import org.opends.server.api.MonitorProvider;
043import org.opends.server.core.DirectoryServer;
044import org.opends.server.replication.common.CSN;
045import org.opends.server.replication.common.DSInfo;
046import org.opends.server.replication.common.RSInfo;
047import org.opends.server.replication.common.ServerState;
048import org.opends.server.replication.common.ServerStatus;
049import org.opends.server.replication.common.StatusMachineEvent;
050import org.opends.server.replication.protocol.AckMsg;
051import org.opends.server.replication.protocol.ChangeStatusMsg;
052import org.opends.server.replication.protocol.ChangeTimeHeartbeatMsg;
053import org.opends.server.replication.protocol.ErrorMsg;
054import org.opends.server.replication.protocol.MonitorMsg;
055import org.opends.server.replication.protocol.MonitorRequestMsg;
056import org.opends.server.replication.protocol.ReplicaOfflineMsg;
057import org.opends.server.replication.protocol.ResetGenerationIdMsg;
058import org.opends.server.replication.protocol.RoutableMsg;
059import org.opends.server.replication.protocol.TopologyMsg;
060import org.opends.server.replication.protocol.UpdateMsg;
061import org.opends.server.replication.server.changelog.api.ChangelogException;
062import org.opends.server.replication.server.changelog.api.DBCursor;
063import org.opends.server.replication.server.changelog.api.DBCursor.CursorOptions;
064import org.opends.server.replication.server.changelog.api.ReplicationDomainDB;
065import org.forgerock.opendj.ldap.DN;
066import org.opends.server.types.DirectoryException;
067import org.opends.server.types.HostPort;
068
069import static org.opends.messages.ReplicationMessages.*;
070import static org.opends.server.replication.common.ServerStatus.*;
071import static org.opends.server.replication.common.StatusMachineEvent.*;
072import static org.opends.server.replication.protocol.ProtocolVersion.*;
073import static org.opends.server.replication.server.changelog.api.DBCursor.KeyMatchingStrategy.*;
074import static org.opends.server.replication.server.changelog.api.DBCursor.PositionStrategy.*;
075import static org.opends.server.util.CollectionUtils.*;
076import static org.opends.server.util.StaticUtils.*;
077
/**
 * This class defines an in-memory cache that is used to store
 * the messages that have been received from an LDAP server or
 * from another replication server and that should be forwarded to
 * other servers.
 * <p>
 * The size of the cache is set by configuration.
 * If the cache becomes bigger than the configured size, the older messages
 * are removed and, should they be needed again, must be read from the backing
 * file.
 * <p>
 * It runs a thread that is responsible for saving the received messages
 * to disk and for trimming them.
 * The decision to trim can be based on disk space or on the age of the message.
 */
093public class ReplicationServerDomain extends MonitorProvider<MonitorProviderCfg>
094{
  /** The base DN this ReplicationServerDomain is associated to (set once by the constructor). */
  private final DN baseDN;

  /**
   * Periodically verifies whether the connected DSs are late and publishes any
   * pending status messages.
   */
  private final StatusAnalyzer statusAnalyzer;

  /**
   * The monitoring publisher that periodically sends monitoring messages to the
   * topology. Using an AtomicReference to avoid leaking references to costly
   * threads.
   */
  private final AtomicReference<MonitoringPublisher> monitoringPublisher = new AtomicReference<>();
  /** Maintains monitor data for the current domain. */
  private final ReplicationDomainMonitor domainMonitor = new ReplicationDomainMonitor(this);

  /**
   * The handlers of the directory servers (DSs) currently connected for this
   * domain, held in a concurrent map with integer keys (presumably the DS
   * server id - confirm at the insertion sites).
   * <p>
   * NOTE(review): the previous wording of this comment described "one balanced
   * tree for each replica ID"; the field is a {@link ConcurrentHashMap} of
   * handlers, so that description looks outdated.
   */
  private final Map<Integer, DataServerHandler> connectedDSs = new ConcurrentHashMap<>();

  /**
   * This map contains one ServerHandler for each replication server with which
   * we are connected (so normally all the replication servers), held in a
   * concurrent map with integer keys (presumably the RS server id - confirm at
   * the insertion sites).
   * <p>
   * NOTE(review): the previous wording also mentioned a "balanced tree" holding
   * the next change to push; no such structure is visible on this field.
   */
  private final Map<Integer, ReplicationServerHandler> connectedRSs = new ConcurrentHashMap<>();

  /**
   * The domain DB obtained from the changelog DB of the local replication
   * server; updates received by this domain are published to it.
   */
  private final ReplicationDomainDB domainDB;
  /** The ReplicationServer that created the current instance. */
  private final ReplicationServer localReplicationServer;

  /**
   * The generationId of the current replication domain. The generationId is
   * computed by hashing the first 1000 entries in the DB.
   * <p>
   * Initialized to -1; the comparison helpers of this class only treat strictly
   * positive values as a usable generation id.
   */
  private volatile long generationId = -1;
  /**
   * JNR, this is legacy code, hard to follow logic. I think what this field
   * tries to say is: "is the generationId in use anywhere?", i.e. is there a
   * replication topology in place? As soon as an answer to any of these
   * question comes true, then it is set to true.
   * <p>
   * It looks like the only use of this field is to prevent the
   * {@link #generationId} from being reset by
   * {@link #resetGenerationIdIfPossible()}.
   */
  private volatile boolean generationIdSavedStatus;

  /** The tracer object for the debug logger. */
  private static final LocalizedLogger logger = LocalizedLogger.getLoggerForThisClass();

  /**
   * The needed info for each received assured update message we are waiting
   * acks for.
   * <p>
   * Key: a CSN matching a received update message which requested
   * assured mode usage (either safe read or safe data mode)
   * <p>
   * Value: The object holding every info needed about the already received acks
   * as well as the acks to be received.
   *
   * @see ExpectedAcksInfo For more details, see ExpectedAcksInfo and its sub
   *      classes javadoc.
   */
  private final Map<CSN, ExpectedAcksInfo> waitingAcks = new ConcurrentHashMap<>();

  /**
   * The timer used to run the timeout code (timer tasks) for the assured update
   * messages we are waiting acks for.
   */
  private final Timer assuredTimeoutTimer;
  /**
   * Counter used to purge the timer tasks references in assuredTimeoutTimer,
   * every n number of treated assured messages.
   * <p>
   * NOTE(review): this is a plain int incremented without synchronization when
   * an assured update is registered; confirm that either callers are
   * serialized or an approximate purge cadence is acceptable.
   */
  private int assuredTimeoutTimerPurgeCounter;
176
177
178
179  /**
180   * Stores pending status messages such as DS change time heartbeats for future
181   * forwarding to the rest of the topology. This class is required in order to
182   * decouple inbound IO processing from outbound IO processing and avoid
183   * potential inter-process deadlocks. In particular, the {@code ServerReader}
184   * thread must not send messages.
185   */
186  private static class PendingStatusMessages
187  {
188    private final Map<Integer, ChangeTimeHeartbeatMsg> pendingHeartbeats = new HashMap<>(1);
189    private final Map<Integer, MonitorMsg> pendingDSMonitorMsgs = new HashMap<>(1);
190    private final Map<Integer, MonitorMsg> pendingRSMonitorMsgs = new HashMap<>(1);
191    private boolean sendRSTopologyMsg;
192    private boolean sendDSTopologyMsg;
193    private int excludedDSForTopologyMsg = -1;
194
195    /**
196     * Enqueues a TopologyMsg for all the connected directory servers in order
197     * to let them know the topology (every known DSs and RSs).
198     *
199     * @param excludedDS
200     *          If not null, the topology message will not be sent to this DS.
201     */
202    private void enqueueTopoInfoToAllDSsExcept(DataServerHandler excludedDS)
203    {
204      int excludedServerId = excludedDS != null ? excludedDS.getServerId() : -1;
205      if (sendDSTopologyMsg)
206      {
207        if (excludedServerId != excludedDSForTopologyMsg)
208        {
209          excludedDSForTopologyMsg = -1;
210        }
211      }
212      else
213      {
214        sendDSTopologyMsg = true;
215        excludedDSForTopologyMsg = excludedServerId;
216      }
217    }
218
219    /**
220     * Enqueues a TopologyMsg for all the connected replication servers in order
221     * to let them know our connected LDAP servers.
222     */
223    private void enqueueTopoInfoToAllRSs()
224    {
225      sendRSTopologyMsg = true;
226    }
227
228    /**
229     * Enqueues a ChangeTimeHeartbeatMsg received from a DS for forwarding to
230     * all other RS instances.
231     *
232     * @param msg
233     *          The heartbeat message.
234     */
235    private void enqueueChangeTimeHeartbeatMsg(ChangeTimeHeartbeatMsg msg)
236    {
237      pendingHeartbeats.put(msg.getCSN().getServerId(), msg);
238    }
239
240    private void enqueueDSMonitorMsg(int dsServerId, MonitorMsg msg)
241    {
242      pendingDSMonitorMsgs.put(dsServerId, msg);
243    }
244
245    private void enqueueRSMonitorMsg(int rsServerId, MonitorMsg msg)
246    {
247      pendingRSMonitorMsgs.put(rsServerId, msg);
248    }
249
250    /** {@inheritDoc} */
251    @Override
252    public String toString()
253    {
254      return getClass().getSimpleName()
255          + " pendingHeartbeats=" + pendingHeartbeats
256          + ", pendingDSMonitorMsgs=" + pendingDSMonitorMsgs
257          + ", pendingRSMonitorMsgs=" + pendingRSMonitorMsgs
258          + ", sendRSTopologyMsg=" + sendRSTopologyMsg
259          + ", sendDSTopologyMsg=" + sendDSTopologyMsg
260          + ", excludedDSForTopologyMsg=" + excludedDSForTopologyMsg;
261    }
262  }
263
  /** Lock object named by the {@code @GuardedBy} annotation on {@link #pendingStatusMessages}. */
  private final Object pendingStatusMessagesLock = new Object();

  /**
   * The status messages currently waiting to be forwarded. Access is expected
   * to happen while holding {@link #pendingStatusMessagesLock}.
   */
  @GuardedBy("pendingStatusMessagesLock")
  private PendingStatusMessages pendingStatusMessages = new PendingStatusMessages();
268
269  /**
270   * Creates a new ReplicationServerDomain associated to the baseDN.
271   *
272   * @param baseDN
273   *          The baseDN associated to the ReplicationServerDomain.
274   * @param localReplicationServer
275   *          the ReplicationServer that created this instance.
276   */
277  public ReplicationServerDomain(DN baseDN,
278      ReplicationServer localReplicationServer)
279  {
280    this.baseDN = baseDN;
281    this.localReplicationServer = localReplicationServer;
282    this.assuredTimeoutTimer = new Timer("Replication server RS("
283        + localReplicationServer.getServerId()
284        + ") assured timer for domain \"" + baseDN + "\"", true);
285    this.domainDB =
286        localReplicationServer.getChangelogDB().getReplicationDomainDB();
287    this.statusAnalyzer = new StatusAnalyzer(this);
288    this.statusAnalyzer.start();
289    DirectoryServer.registerMonitorProvider(this);
290  }
291
292  /**
293   * Add an update that has been received to the list of
294   * updates that must be forwarded to all other servers.
295   *
296   * @param updateMsg  The update that has been received.
297   * @param sourceHandler The ServerHandler for the server from which the
298   *        update was received
299   * @throws IOException When an IO exception happens during the update
300   *         processing.
301   */
302  public void put(UpdateMsg updateMsg, ServerHandler sourceHandler) throws IOException
303  {
304    sourceHandler.updateServerState(updateMsg);
305    sourceHandler.incrementInCount();
306    setGenerationIdIfUnset(sourceHandler.getGenerationId());
307
308    /**
309     * If this is an assured message (a message requesting ack), we must
310     * construct the ExpectedAcksInfo object with the right number of expected
311     * acks before posting message to the writers. Otherwise some writers may
312     * have time to post, receive the ack and increment received ack counter
313     * (kept in ExpectedAcksInfo object) and we could think the acknowledgment
314     * is fully processed although it may be not (some other acks from other
315     * servers are not yet arrived). So for that purpose we do a pre-loop
316     * to determine to who we will post an assured message.
317     * Whether the assured mode is safe read or safe data, we anyway do not
318     * support the assured replication feature across topologies with different
319     * group ids. The assured feature insures assured replication based on the
320     * same locality (group id). For instance in double data center deployment
321     * (2 group id usage) with assured replication enabled, an assured message
322     * sent from data center 1 (group id = 1) will be sent to servers of both
323     * data centers, but one will request and wait acks only from servers of the
324     * data center 1.
325     */
326    final PreparedAssuredInfo preparedAssuredInfo = getPreparedAssuredInfo(updateMsg, sourceHandler);
327
328    if (!publishUpdateMsg(updateMsg))
329    {
330      return;
331    }
332
333    final List<Integer> assuredServers = getAssuredServers(updateMsg, preparedAssuredInfo);
334
335    /**
336     * The update message equivalent to the originally received update message,
337     * but with assured flag disabled. This message is the one that should be
338     * sent to non eligible servers for assured mode.
339     * We need a clone like of the original message with assured flag off, to be
340     * posted to servers we don't want to wait the ack from (not normal status
341     * servers or servers with different group id). This must be done because
342     * the posted message is a reference so each writer queue gets the same
343     * reference, thus, changing the assured flag of an object is done for every
344     * references posted on every writer queues. That is why we need a message
345     * version with assured flag on and another one with assured flag off.
346     */
347    final NotAssuredUpdateMsg notAssuredUpdateMsg =
348        preparedAssuredInfo != null ? new NotAssuredUpdateMsg(updateMsg) : null;
349
350    // Push the message to the replication servers
351    if (sourceHandler.isDataServer())
352    {
353      for (ReplicationServerHandler rsHandler : connectedRSs.values())
354      {
355        /**
356         * Ignore updates to RS with bad gen id
357         * (no system managed status for a RS)
358         */
359        if (!isDifferentGenerationId(rsHandler, updateMsg))
360        {
361          addUpdate(rsHandler, updateMsg, notAssuredUpdateMsg, assuredServers);
362        }
363      }
364    }
365
366    // Push the message to the LDAP servers
367    for (DataServerHandler dsHandler : connectedDSs.values())
368    {
369      // Do not forward the change to the server that just sent it
370      if (dsHandler != sourceHandler
371          && !isUpdateMsgFiltered(updateMsg, dsHandler))
372      {
373        addUpdate(dsHandler, updateMsg, notAssuredUpdateMsg, assuredServers);
374      }
375    }
376  }
377
378  private boolean isDifferentGenerationId(ReplicationServerHandler rsHandler,
379      UpdateMsg updateMsg)
380  {
381    final boolean isDifferent = isDifferentGenerationId(rsHandler.getGenerationId());
382    if (isDifferent && logger.isTraceEnabled())
383    {
384      debug("updateMsg " + updateMsg.getCSN()
385          + " will not be sent to replication server "
386          + rsHandler.getServerId() + " with generation id "
387          + rsHandler.getGenerationId() + " different from local "
388          + "generation id " + generationId);
389    }
390    return isDifferent;
391  }
392
393  /**
394   * Ignore updates to DS in bad BAD_GENID_STATUS or FULL_UPDATE_STATUS.
395   * <p>
396   * The RSD lock should not be taken here as it is acceptable to have a delay
397   * between the time the server has a wrong status and the fact we detect it:
398   * the updates that succeed to pass during this time will have no impact on
399   * remote server. But it is interesting to not saturate uselessly the network
400   * if the updates are not necessary so this check to stop sending updates is
401   * interesting anyway. Not taking the RSD lock allows to have better
402   * performances in normal mode (most of the time).
403   */
404  private boolean isUpdateMsgFiltered(UpdateMsg updateMsg, DataServerHandler dsHandler)
405  {
406    final ServerStatus dsStatus = dsHandler.getStatus();
407    if (dsStatus == ServerStatus.BAD_GEN_ID_STATUS)
408    {
409      if (logger.isTraceEnabled())
410      {
411        debug("updateMsg " + updateMsg.getCSN()
412            + " will not be sent to directory server "
413            + dsHandler.getServerId() + " with generation id "
414            + dsHandler.getGenerationId() + " different from local "
415            + "generation id " + generationId);
416      }
417      return true;
418    }
419    else if (dsStatus == ServerStatus.FULL_UPDATE_STATUS)
420    {
421      if (logger.isTraceEnabled())
422      {
423        debug("updateMsg " + updateMsg.getCSN()
424            + " will not be sent to directory server "
425            + dsHandler.getServerId() + " as it is in full update");
426      }
427      return true;
428    }
429    return false;
430  }
431
432  private PreparedAssuredInfo getPreparedAssuredInfo(UpdateMsg updateMsg,
433      ServerHandler sourceHandler) throws IOException
434  {
435    // Assured feature is supported starting from replication protocol V2
436    if (!updateMsg.isAssured()
437        || sourceHandler.getProtocolVersion() < REPLICATION_PROTOCOL_V2)
438    {
439      return null;
440    }
441
442    // According to assured sub-mode, prepare structures to keep track of
443    // the acks we are interested in.
444    switch (updateMsg.getAssuredMode())
445    {
446    case SAFE_DATA_MODE:
447      sourceHandler.incrementAssuredSdReceivedUpdates();
448      return processSafeDataUpdateMsg(updateMsg, sourceHandler);
449
450    case SAFE_READ_MODE:
451      sourceHandler.incrementAssuredSrReceivedUpdates();
452      return processSafeReadUpdateMsg(updateMsg, sourceHandler);
453
454    default:
455      // Unknown assured mode: should never happen
456      logger.error(ERR_RS_UNKNOWN_ASSURED_MODE,
457          localReplicationServer.getServerId(), updateMsg.getAssuredMode(), baseDN, updateMsg);
458      return null;
459    }
460  }
461
462  private List<Integer> getAssuredServers(UpdateMsg updateMsg, PreparedAssuredInfo preparedAssuredInfo)
463  {
464    List<Integer> expectedServers = null;
465    if (preparedAssuredInfo != null && preparedAssuredInfo.expectedServers != null)
466    {
467      expectedServers = preparedAssuredInfo.expectedServers;
468      // Store the expected acks info into the global map.
469      // The code for processing reception of acks for this update will update
470      // info kept in this object and if enough acks received, it will send
471      // back the final ack to the requester and remove the object from this map
472      // OR
473      // The following timer will time out and send an timeout ack to the
474      // requester if the acks are not received in time. The timer will also
475      // remove the object from this map.
476      final CSN csn = updateMsg.getCSN();
477      waitingAcks.put(csn, preparedAssuredInfo.expectedAcksInfo);
478
479      // Arm timer for this assured update message (wait for acks until it times out)
480      final AssuredTimeoutTask assuredTimeoutTask = new AssuredTimeoutTask(csn);
481      assuredTimeoutTimer.schedule(assuredTimeoutTask, localReplicationServer.getAssuredTimeout());
482      // Purge timer every 100 treated messages
483      assuredTimeoutTimerPurgeCounter++;
484      if ((assuredTimeoutTimerPurgeCounter % 100) == 0)
485      {
486        assuredTimeoutTimer.purge();
487      }
488    }
489
490    return expectedServers != null ? expectedServers : Collections.<Integer> emptyList();
491  }
492
493  private boolean publishUpdateMsg(UpdateMsg updateMsg)
494  {
495    try
496    {
497      if (updateMsg instanceof ReplicaOfflineMsg)
498      {
499        final ReplicaOfflineMsg offlineMsg = (ReplicaOfflineMsg) updateMsg;
500        this.domainDB.notifyReplicaOffline(baseDN, offlineMsg.getCSN());
501        return true;
502      }
503
504      if (this.domainDB.publishUpdateMsg(baseDN, updateMsg))
505      {
506        /*
507         * JNR: Matt and I had a hard time figuring out where to put this
508         * synchronized block. We elected to put it here, but without a strong
509         * conviction.
510         */
511        synchronized (generationIDLock)
512        {
513          /*
514           * JNR: I think the generationIdSavedStatus is set to true because
515           * method above created a ReplicaDB which assumes the generationId was
516           * communicated to another server. Hence setting true on this field
517           * prevent the generationId from being reset.
518           */
519          generationIdSavedStatus = true;
520        }
521      }
522      return true;
523    }
524    catch (ChangelogException e)
525    {
526      /*
527       * Because of database problem we can't save any more changes from at
528       * least one LDAP server. This replicationServer therefore can't do it's
529       * job properly anymore and needs to close all its connections and
530       * shutdown itself.
531       */
532      logger.error(ERR_CHANGELOG_SHUTDOWN_DATABASE_ERROR, stackTraceToSingleLineString(e));
533      localReplicationServer.shutdown();
534      return false;
535    }
536  }
537
538  private void addUpdate(ServerHandler sHandler, UpdateMsg updateMsg,
539      NotAssuredUpdateMsg notAssuredUpdateMsg, List<Integer> assuredServers)
540  {
541    // Assured mode: post an assured or not assured matching update message
542    // according to what has been computed for the destination server
543    if (notAssuredUpdateMsg != null
544        && !assuredServers.contains(sHandler.getServerId()))
545    {
546      sHandler.add(notAssuredUpdateMsg);
547    }
548    else
549    {
550      sHandler.add(updateMsg);
551    }
552  }
553
554  /**
555   * Helper class to be the return type of a method that processes a just
556   * received assured update message:
557   * - processSafeReadUpdateMsg
558   * - processSafeDataUpdateMsg
559   * This is a facility to pack many interesting returned object.
560   */
561  private class PreparedAssuredInfo
562  {
563      /**
564       * The list of servers identified as servers we are interested in
565       * receiving acks from. If this list is not null, then expectedAcksInfo
566       * should be not null.
567       * Servers that are not in this list are servers not eligible for an ack
568       * request.
569       */
570      public List<Integer> expectedServers;
571
572      /**
573       * The constructed ExpectedAcksInfo object to be used when acks will be
574       * received. Null if expectedServers is null.
575       */
576      public ExpectedAcksInfo expectedAcksInfo;
577  }
578
579  /**
580   * Process a just received assured update message in Safe Read mode. If the
581   * ack can be sent immediately, it is done here. This will also determine to
582   * which suitable servers an ack should be requested from, and which ones are
583   * not eligible for an ack request.
584   * This method is an helper method for the put method. Have a look at the put
585   * method for a better understanding.
586   * @param update The just received assured update to process.
587   * @param sourceHandler The ServerHandler for the server from which the
588   *        update was received
589   * @return A suitable PreparedAssuredInfo object that contains every needed
590   * info to proceed with post to server writers.
591   * @throws IOException When an IO exception happens during the update
592   *         processing.
593   */
594  private PreparedAssuredInfo processSafeReadUpdateMsg(
595    UpdateMsg update, ServerHandler sourceHandler) throws IOException
596  {
597    CSN csn = update.getCSN();
598    byte groupId = localReplicationServer.getGroupId();
599    byte sourceGroupId = sourceHandler.getGroupId();
600    List<Integer> expectedServers = new ArrayList<>();
601    List<Integer> wrongStatusServers = new ArrayList<>();
602
603    if (sourceGroupId == groupId)
604      // Assured feature does not cross different group ids
605    {
606      if (sourceHandler.isDataServer())
607      {
608        collectRSsEligibleForAssuredReplication(groupId, expectedServers);
609      }
610
611      // Look for DS eligible for assured
612      for (DataServerHandler dsHandler : connectedDSs.values())
613      {
614        // Don't forward the change to the server that just sent it
615        if (dsHandler == sourceHandler)
616        {
617          continue;
618        }
619        if (dsHandler.getGroupId() == groupId)
620          // No ack expected from a DS with different group id
621        {
622          ServerStatus serverStatus = dsHandler.getStatus();
623          if (serverStatus == ServerStatus.NORMAL_STATUS)
624          {
625            expectedServers.add(dsHandler.getServerId());
626          } else if (serverStatus == ServerStatus.DEGRADED_STATUS) {
627            // No ack expected from a DS with wrong status
628            wrongStatusServers.add(dsHandler.getServerId());
629          }
630          /*
631           * else
632           * BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS:
633           * We do not want this to be reported as an error to the update
634           * maker -> no pollution or potential misunderstanding when
635           * reading logs or monitoring and it was just administration (for
636           * instance new server is being configured in topo: it goes in bad
637           * gen then full update).
638           */
639        }
640      }
641    }
642
643    // Return computed structures
644    PreparedAssuredInfo preparedAssuredInfo = new PreparedAssuredInfo();
645    if (!expectedServers.isEmpty())
646    {
647      // Some other acks to wait for
648      preparedAssuredInfo.expectedAcksInfo = new SafeReadExpectedAcksInfo(csn,
649        sourceHandler, expectedServers, wrongStatusServers);
650      preparedAssuredInfo.expectedServers = expectedServers;
651    }
652
653    if (preparedAssuredInfo.expectedServers == null)
654    {
655      // No eligible servers found, send the ack immediately
656      sourceHandler.send(new AckMsg(csn));
657    }
658
659    return preparedAssuredInfo;
660  }
661
662  /**
663   * Process a just received assured update message in Safe Data mode. If the
664   * ack can be sent immediately, it is done here. This will also determine to
665   * which suitable servers an ack should be requested from, and which ones are
666   * not eligible for an ack request.
667   * This method is an helper method for the put method. Have a look at the put
668   * method for a better understanding.
669   * @param update The just received assured update to process.
670   * @param sourceHandler The ServerHandler for the server from which the
671   *        update was received
672   * @return A suitable PreparedAssuredInfo object that contains every needed
673   * info to proceed with post to server writers.
674   * @throws IOException When an IO exception happens during the update
675   *         processing.
676   */
677  private PreparedAssuredInfo processSafeDataUpdateMsg(
678    UpdateMsg update, ServerHandler sourceHandler) throws IOException
679  {
680    CSN csn = update.getCSN();
681    boolean interestedInAcks = false;
682    byte safeDataLevel = update.getSafeDataLevel();
683    byte groupId = localReplicationServer.getGroupId();
684    byte sourceGroupId = sourceHandler.getGroupId();
685    if (safeDataLevel < (byte) 1)
686    {
687      // Should never happen
688      logger.error(ERR_UNKNOWN_ASSURED_SAFE_DATA_LEVEL,
689          localReplicationServer.getServerId(), safeDataLevel, baseDN, update);
690    } else if (sourceGroupId == groupId
691    // Assured feature does not cross different group IDS
692        && isSameGenerationId(sourceHandler.getGenerationId()))
693    // Ignore assured updates from wrong generationId servers
694    {
695        if (sourceHandler.isDataServer())
696        {
697          if (safeDataLevel == (byte) 1)
698          {
699            /**
700             * Immediately return the ack for an assured message in safe data
701             * mode with safe data level 1, coming from a DS. No need to wait
702             * for more acks
703             */
704            sourceHandler.send(new AckMsg(csn));
705          } else
706          {
707            /**
708             * level > 1 : We need further acks
709             * The message will be posted in assured mode to eligible
710             * servers. The embedded safe data level is not changed, and his
711             * value will be used by a remote RS to determine if he must send
712             * an ack (level > 1) or not (level = 1)
713             */
714            interestedInAcks = true;
715          }
716        } else
717        { // A RS sent us the safe data message, for sure no further ack to wait
718          /**
719           * Level 1 has already been reached so no further acks to wait.
720           * Just deal with level > 1
721           */
722          if (safeDataLevel > (byte) 1)
723          {
724            sourceHandler.send(new AckMsg(csn));
725          }
726        }
727    }
728
729    List<Integer> expectedServers = new ArrayList<>();
730    if (interestedInAcks && sourceHandler.isDataServer())
731    {
732      collectRSsEligibleForAssuredReplication(groupId, expectedServers);
733    }
734
735    // Return computed structures
736    PreparedAssuredInfo preparedAssuredInfo = new PreparedAssuredInfo();
737    int nExpectedServers = expectedServers.size();
738    if (interestedInAcks) // interestedInAcks so level > 1
739    {
740      if (nExpectedServers > 0)
741      {
742        // Some other acks to wait for
743        int sdl = update.getSafeDataLevel();
744        int neededAdditionalServers = sdl - 1;
745        // Change the number of expected acks if not enough available eligible
746        // servers: the level is a best effort thing, we do not want to timeout
747        // at every assured SD update for instance if a RS has had his gen id
748        // reseted
749        byte finalSdl = (nExpectedServers >= neededAdditionalServers) ?
750          (byte)sdl : // Keep level as it was
751          (byte)(nExpectedServers+1); // Change level to match what's available
752        preparedAssuredInfo.expectedAcksInfo = new SafeDataExpectedAcksInfo(csn,
753          sourceHandler, finalSdl, expectedServers);
754        preparedAssuredInfo.expectedServers = expectedServers;
755      } else
756      {
757        // level > 1 and source is a DS but no eligible servers found, send the
758        // ack immediately
759        sourceHandler.send(new AckMsg(csn));
760      }
761    }
762
763    return preparedAssuredInfo;
764  }
765
766  private void collectRSsEligibleForAssuredReplication(byte groupId,
767      List<Integer> expectedServers)
768  {
769    for (ReplicationServerHandler rsHandler : connectedRSs.values())
770    {
771      if (rsHandler.getGroupId() == groupId
772      // No ack expected from a RS with different group id
773            && isSameGenerationId(rsHandler.getGenerationId())
774        // No ack expected from a RS with bad gen id
775        )
776      {
777        expectedServers.add(rsHandler.getServerId());
778      }
779    }
780  }
781
782  private boolean isSameGenerationId(long generationId)
783  {
784    return this.generationId > 0 && this.generationId == generationId;
785  }
786
787  private boolean isDifferentGenerationId(long generationId)
788  {
789    return this.generationId > 0 && this.generationId != generationId;
790  }
791
792  /**
793   * Process an ack received from a given server.
794   *
795   * @param ack The ack message received.
796   * @param ackingServer The server handler of the server that sent the ack.
797   */
798  void processAck(AckMsg ack, ServerHandler ackingServer)
799  {
800    // Retrieve the expected acks info for the update matching the original
801    // sent update.
802    CSN csn = ack.getCSN();
803    ExpectedAcksInfo expectedAcksInfo = waitingAcks.get(csn);
804
805    if (expectedAcksInfo != null)
806    {
807      // Prevent concurrent access from processAck() or AssuredTimeoutTask.run()
808      synchronized (expectedAcksInfo)
809      {
810        if (expectedAcksInfo.isCompleted())
811        {
812          // Timeout code is sending a timeout ack, do nothing and let him
813          // remove object from the map
814          return;
815        }
816        /**
817         *
818         * If this is the last ack we were waiting from, immediately create and
819         * send the final ack to the original server
820         */
821        if (expectedAcksInfo.processReceivedAck(ackingServer, ack))
822        {
823          // Remove the object from the map as no more needed
824          waitingAcks.remove(csn);
825          AckMsg finalAck = expectedAcksInfo.createAck(false);
826          ServerHandler origServer = expectedAcksInfo.getRequesterServer();
827          try
828          {
829            origServer.send(finalAck);
830          } catch (IOException e)
831          {
832            /**
833             * An error happened trying the send back an ack to the server.
834             * Log an error and close the connection to this server.
835             */
836            LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
837            mb.append(ERR_RS_ERROR_SENDING_ACK.get(
838                localReplicationServer.getServerId(), origServer.getServerId(), csn, baseDN));
839            mb.append(" ");
840            mb.append(stackTraceToSingleLineString(e));
841            logger.error(mb.toMessage());
842            stopServer(origServer, false);
843          }
844          // Mark the ack info object as completed to prevent potential timeout
845          // code parallel run
846          expectedAcksInfo.completed();
847        }
848      }
849    }
850    /* Else the timeout occurred for the update matching this CSN
851     * and the ack with timeout error has probably already been sent.
852     */
853  }
854
855  /**
856   * The code run when the timeout occurs while waiting for acks of the
857   * eligible servers. This basically sends a timeout ack (with any additional
858   * error info) to the original server that sent an assured update message.
859   */
860  private class AssuredTimeoutTask extends TimerTask
861  {
862    private CSN csn;
863
864    /**
865     * Constructor for the timer task.
866     * @param csn The CSN of the assured update we are waiting acks for
867     */
868    public AssuredTimeoutTask(CSN csn)
869    {
870      this.csn = csn;
871    }
872
873    /**
874     * Run when the assured timeout for an assured update message we are waiting
875     * acks for occurs.
876     */
877    @Override
878    public void run()
879    {
880      ExpectedAcksInfo expectedAcksInfo = waitingAcks.get(csn);
881
882      if (expectedAcksInfo != null)
883      {
884        synchronized (expectedAcksInfo)
885        {
886          if (expectedAcksInfo.isCompleted())
887          {
888            // processAck() code is sending the ack, do nothing and let him
889            // remove object from the map
890            return;
891          }
892          // Remove the object from the map as no more needed
893          waitingAcks.remove(csn);
894          // Create the timeout ack and send him to the server the assured
895          // update message came from
896          AckMsg finalAck = expectedAcksInfo.createAck(true);
897          ServerHandler origServer = expectedAcksInfo.getRequesterServer();
898          if (logger.isTraceEnabled())
899          {
900            debug("sending timeout for assured update with CSN " + csn
901                + " to serverId=" + origServer.getServerId());
902          }
903          try
904          {
905            origServer.send(finalAck);
906          } catch (IOException e)
907          {
908            /**
909             * An error happened trying the send back an ack to the server.
910             * Log an error and close the connection to this server.
911             */
912            LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
913            mb.append(ERR_RS_ERROR_SENDING_ACK.get(
914                localReplicationServer.getServerId(), origServer.getServerId(), csn, baseDN));
915            mb.append(" ");
916            mb.append(stackTraceToSingleLineString(e));
917            logger.error(mb.toMessage());
918            stopServer(origServer, false);
919          }
920          // Increment assured counters
921          boolean safeRead =
922              expectedAcksInfo instanceof SafeReadExpectedAcksInfo;
923          if (safeRead)
924          {
925            origServer.incrementAssuredSrReceivedUpdatesTimeout();
926          } else
927          {
928            if (origServer.isDataServer())
929            {
930              origServer.incrementAssuredSdReceivedUpdatesTimeout();
931            }
932          }
933          //   retrieve expected servers in timeout to increment their counter
934          List<Integer> serversInTimeout = expectedAcksInfo.getTimeoutServers();
935          for (Integer serverId : serversInTimeout)
936          {
937            ServerHandler expectedDSInTimeout = connectedDSs.get(serverId);
938            ServerHandler expectedRSInTimeout = connectedRSs.get(serverId);
939            if (expectedDSInTimeout != null)
940            {
941              if (safeRead)
942              {
943                expectedDSInTimeout.incrementAssuredSrSentUpdatesTimeout();
944              } // else no SD update sent to a DS (meaningless)
945            } else if (expectedRSInTimeout != null)
946            {
947              if (safeRead)
948              {
949                expectedRSInTimeout.incrementAssuredSrSentUpdatesTimeout();
950              }
951              else
952              {
953                expectedRSInTimeout.incrementAssuredSdSentUpdatesTimeout();
954              }
955            }
956            // else server disappeared ? Let's forget about it.
957          }
958          // Mark the ack info object as completed to prevent potential
959          // processAck() code parallel run
960          expectedAcksInfo.completed();
961        }
962      }
963    }
964  }
965
966
967  /**
968   * Stop operations with a list of replication servers.
969   *
970   * @param serversToDisconnect
971   *          the replication servers addresses for which we want to stop
972   *          operations
973   */
974  public void stopReplicationServers(Collection<HostPort> serversToDisconnect)
975  {
976    for (ReplicationServerHandler rsHandler : connectedRSs.values())
977    {
978      if (serversToDisconnect.contains(
979            HostPort.valueOf(rsHandler.getServerAddressURL())))
980      {
981        stopServer(rsHandler, false);
982      }
983    }
984  }
985
986  /**
987   * Stop operations with all servers this domain is connected with (RS and DS).
988   *
989   * @param shutdown A boolean indicating if the stop is due to a
990   *                 shutdown condition.
991   */
992  public void stopAllServers(boolean shutdown)
993  {
994    for (ReplicationServerHandler rsHandler : connectedRSs.values())
995    {
996      stopServer(rsHandler, shutdown);
997    }
998
999    for (DataServerHandler dsHandler : connectedDSs.values())
1000    {
1001      stopServer(dsHandler, shutdown);
1002    }
1003  }
1004
1005  /**
1006   * Checks whether it is already connected to a DS with same id.
1007   *
1008   * @param dsHandler
1009   *          the DS we want to check
1010   * @return true if this DS is already connected to the current server
1011   */
1012  public boolean isAlreadyConnectedToDS(DataServerHandler dsHandler)
1013  {
1014    if (connectedDSs.containsKey(dsHandler.getServerId()))
1015    {
1016      // looks like two connected LDAP servers have the same serverId
1017      logger.error(ERR_DUPLICATE_SERVER_ID, localReplicationServer.getMonitorInstanceName(),
1018          connectedDSs.get(dsHandler.getServerId()), dsHandler, dsHandler.getServerId());
1019      return true;
1020    }
1021    return false;
1022  }
1023
1024  /**
1025   * Stop operations with a given server.
1026   *
1027   * @param sHandler the server for which we want to stop operations.
1028   * @param shutdown A boolean indicating if the stop is due to a
1029   *                 shutdown condition.
1030   */
1031  public void stopServer(ServerHandler sHandler, boolean shutdown)
1032  {
1033    // TODO JNR merge with stopServer(MessageHandler)
1034    if (logger.isTraceEnabled())
1035    {
1036      debug("stopServer() on the server handler " + sHandler);
1037    }
1038    /*
1039     * We must prevent deadlock on replication server domain lock, when for
1040     * instance this code is called from dying ServerReader but also dying
1041     * ServerWriter at the same time, or from a thread that wants to shut down
1042     * the handler. So use a thread safe flag to know if the job must be done
1043     * or not (is already being processed or not).
1044     */
1045    if (!sHandler.engageShutdown())
1046      // Only do this once (prevent other thread to enter here again)
1047    {
1048      if (!shutdown)
1049      {
1050        try
1051        {
1052          // Acquire lock on domain (see more details in comment of start()
1053          // method of ServerHandler)
1054          lock();
1055        }
1056        catch (InterruptedException ex)
1057        {
1058          // We can't deal with this here, so re-interrupt thread so that it is
1059          // caught during subsequent IO.
1060          Thread.currentThread().interrupt();
1061          return;
1062        }
1063      }
1064
1065      try
1066      {
1067        // Stop useless monitoring publisher if no more RS or DS in domain
1068        if ( (connectedDSs.size() + connectedRSs.size() )== 1)
1069        {
1070          if (logger.isTraceEnabled())
1071          {
1072            debug("remote server " + sHandler
1073                + " is the last RS/DS to be stopped:"
1074                + " stopping monitoring publisher");
1075          }
1076          stopMonitoringPublisher();
1077        }
1078
1079        if (connectedRSs.containsKey(sHandler.getServerId()))
1080        {
1081          unregisterServerHandler(sHandler, shutdown, false);
1082        }
1083        else if (connectedDSs.containsKey(sHandler.getServerId()))
1084        {
1085          unregisterServerHandler(sHandler, shutdown, true);
1086        }
1087      }
1088      catch(Exception e)
1089      {
1090        logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
1091      }
1092      finally
1093      {
1094        if (!shutdown)
1095        {
1096          release();
1097        }
1098      }
1099    }
1100  }
1101
1102  private void unregisterServerHandler(ServerHandler sHandler, boolean shutdown,
1103      boolean isDirectoryServer)
1104  {
1105    unregisterServerHandler(sHandler);
1106    sHandler.shutdown();
1107
1108    resetGenerationIdIfPossible();
1109    if (!shutdown)
1110    {
1111      synchronized (pendingStatusMessagesLock)
1112      {
1113        if (isDirectoryServer)
1114        {
1115          // Update the remote replication servers with our list
1116          // of connected LDAP servers
1117          pendingStatusMessages.enqueueTopoInfoToAllRSs();
1118        }
1119        // Warn our DSs that a RS or DS has quit (does not use this
1120        // handler as already removed from list)
1121        pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(null);
1122      }
1123      statusAnalyzer.notifyPendingStatusMessage();
1124    }
1125  }
1126
1127  /**
1128   * Unregister this handler from the list of handlers registered to this
1129   * domain.
1130   * @param sHandler the provided handler to unregister.
1131   */
1132  private void unregisterServerHandler(ServerHandler sHandler)
1133  {
1134    if (sHandler.isReplicationServer())
1135    {
1136      connectedRSs.remove(sHandler.getServerId());
1137    }
1138    else
1139    {
1140      connectedDSs.remove(sHandler.getServerId());
1141    }
1142  }
1143
1144  /**
1145   * This method resets the generationId for this domain if there is no LDAP
1146   * server currently connected in the whole topology on this domain and if the
1147   * generationId has never been saved.
1148   * <ul>
1149   * <li>test emptiness of {@link #connectedDSs} list</li>
1150   * <li>traverse {@link #connectedRSs} list and test for each if DS are
1151   * connected</li>
1152   * </ul>
1153   * So it strongly relies on the {@link #connectedDSs} list
1154   */
1155  private void resetGenerationIdIfPossible()
1156  {
1157    if (logger.isTraceEnabled())
1158    {
1159      debug("mayResetGenerationId generationIdSavedStatus="
1160          + generationIdSavedStatus);
1161    }
1162
1163    // If there is no more any LDAP server connected to this domain in the
1164    // topology and the generationId has never been saved, then we can reset
1165    // it and the next LDAP server to connect will become the new reference.
1166    boolean ldapServersConnectedInTheTopology = false;
1167    if (connectedDSs.isEmpty())
1168    {
1169      for (ReplicationServerHandler rsHandler : connectedRSs.values())
1170      {
1171        if (generationId != rsHandler.getGenerationId())
1172        {
1173          if (logger.isTraceEnabled())
1174          {
1175            debug("mayResetGenerationId skip RS " + rsHandler
1176                + " that has different genId");
1177          }
1178        }
1179        else if (rsHandler.hasRemoteLDAPServers())
1180        {
1181          ldapServersConnectedInTheTopology = true;
1182
1183          if (logger.isTraceEnabled())
1184          {
1185            debug("mayResetGenerationId RS " + rsHandler
1186                + " has ldap servers connected to it"
1187                + " - will not reset generationId");
1188          }
1189          break;
1190        }
1191      }
1192    }
1193    else
1194    {
1195      ldapServersConnectedInTheTopology = true;
1196
1197      if (logger.isTraceEnabled())
1198      {
1199        debug("has ldap servers connected to it - will not reset generationId");
1200      }
1201    }
1202
1203    if (!ldapServersConnectedInTheTopology
1204        && !generationIdSavedStatus
1205        && generationId != -1)
1206    {
1207      changeGenerationId(-1);
1208    }
1209  }
1210
1211  /**
1212   * Checks whether a remote RS is already connected to this hosting RS.
1213   *
1214   * @param rsHandler
1215   *          The handler for the remote RS.
1216   * @return flag specifying whether the remote RS is already connected.
1217   * @throws DirectoryException
1218   *           when a problem occurs.
1219   */
1220  public boolean isAlreadyConnectedToRS(ReplicationServerHandler rsHandler)
1221      throws DirectoryException
1222  {
1223    ReplicationServerHandler oldRsHandler =
1224        connectedRSs.get(rsHandler.getServerId());
1225    if (oldRsHandler == null)
1226    {
1227      return false;
1228    }
1229
1230    if (oldRsHandler.getServerAddressURL().equals(
1231        rsHandler.getServerAddressURL()))
1232    {
1233      // this is the same server, this means that our ServerStart messages
1234      // have been sent at about the same time and 2 connections
1235      // have been established.
1236      // Silently drop this connection.
1237      return true;
1238    }
1239
1240    // looks like two replication servers have the same serverId
1241    // log an error message and drop this connection.
1242    LocalizableMessage message = ERR_DUPLICATE_REPLICATION_SERVER_ID.get(
1243        localReplicationServer.getMonitorInstanceName(),
1244        oldRsHandler.getServerAddressURL(), rsHandler.getServerAddressURL(),
1245        rsHandler.getServerId());
1246    throw new DirectoryException(ResultCode.OTHER, message);
1247  }
1248
1249  /**
1250   * Creates and returns a cursor across this replication domain.
1251   * <p>
1252   * Client code must call {@link DBCursor#next()} to advance the cursor to the
1253   * next available record.
1254   * <p>
1255   * When the cursor is not used anymore, client code MUST call the
1256   * {@link DBCursor#close()} method to free the resources and locks used by the
1257   * cursor.
1258   *
1259   * @param startAfterServerState
1260   *          Starting point for the replicaDB cursors. If null, start from the
1261   *          oldest CSN
1262   * @return a non null {@link DBCursor} going from oldest to newest CSN
1263   * @throws ChangelogException
1264   *           If a database problem happened
1265   * @see ReplicationDomainDB#getCursorFrom(DN, ServerState, CursorOptions)
1266   */
1267  public DBCursor<UpdateMsg> getCursorFrom(ServerState startAfterServerState)
1268      throws ChangelogException
1269  {
1270    CursorOptions options = new CursorOptions(GREATER_THAN_OR_EQUAL_TO_KEY, AFTER_MATCHING_KEY);
1271    return domainDB.getCursorFrom(baseDN, startAfterServerState, options);
1272  }
1273
1274  /**
1275   * Get the baseDN.
1276   *
1277   * @return Returns the baseDN.
1278   */
1279  public DN getBaseDN()
1280  {
1281    return baseDN;
1282  }
1283
1284  /**
1285   * Retrieves the destination handlers for a routable message.
1286   *
1287   * @param msg The message to route.
1288   * @param senderHandler The handler of the server that published this message.
1289   * @return The list of destination handlers.
1290   */
1291  private List<ServerHandler> getDestinationServers(RoutableMsg msg,
1292    ServerHandler senderHandler)
1293  {
1294    List<ServerHandler> servers = new ArrayList<>();
1295
1296    if (msg.getDestination() == RoutableMsg.THE_CLOSEST_SERVER)
1297    {
1298      // TODO Import from the "closest server" to be implemented
1299    } else if (msg.getDestination() == RoutableMsg.ALL_SERVERS)
1300    {
1301      if (!senderHandler.isReplicationServer())
1302      {
1303        // Send to all replication servers with a least one remote
1304        // server connected
1305        for (ReplicationServerHandler rsh : connectedRSs.values())
1306        {
1307          if (rsh.hasRemoteLDAPServers())
1308          {
1309            servers.add(rsh);
1310          }
1311        }
1312      }
1313
1314      // Sends to all connected LDAP servers
1315      for (DataServerHandler destinationHandler : connectedDSs.values())
1316      {
1317        // Don't loop on the sender
1318        if (destinationHandler == senderHandler)
1319        {
1320          continue;
1321        }
1322        servers.add(destinationHandler);
1323      }
1324    } else
1325    {
1326      // Destination is one server
1327      DataServerHandler destinationHandler =
1328        connectedDSs.get(msg.getDestination());
1329      if (destinationHandler != null)
1330      {
1331        servers.add(destinationHandler);
1332      } else
1333      {
1334        // the targeted server is NOT connected
1335        // Let's search for the replication server that MAY
1336        // have the targeted server connected.
1337        if (senderHandler.isDataServer())
1338        {
1339          for (ReplicationServerHandler rsHandler : connectedRSs.values())
1340          {
1341            // Send to all replication servers with a least one remote
1342            // server connected
1343            if (rsHandler.isRemoteLDAPServer(msg.getDestination()))
1344            {
1345              servers.add(rsHandler);
1346            }
1347          }
1348        }
1349      }
1350    }
1351    return servers;
1352  }
1353
1354
1355
1356  /**
1357   * Processes a message coming from one server in the topology and potentially
1358   * forwards it to one or all other servers.
1359   *
1360   * @param msg
1361   *          The message received and to be processed.
1362   * @param sender
1363   *          The server handler of the server that sent the message.
1364   */
1365  void process(RoutableMsg msg, ServerHandler sender)
1366  {
1367    if (msg.getDestination() == localReplicationServer.getServerId())
1368    {
1369      // Handle routable messages targeted at this RS.
1370      if (msg instanceof ErrorMsg)
1371      {
1372        ErrorMsg errorMsg = (ErrorMsg) msg;
1373        logger.error(ERR_ERROR_MSG_RECEIVED, errorMsg.getDetails());
1374      }
1375      else
1376      {
1377        replyWithUnroutableMsgType(sender, msg);
1378      }
1379    }
1380    else
1381    {
1382      // Forward message not destined for this RS.
1383      List<ServerHandler> servers = getDestinationServers(msg, sender);
1384      if (!servers.isEmpty())
1385      {
1386        forwardMsgToAllServers(msg, servers, sender);
1387      }
1388      else
1389      {
1390        replyWithUnreachablePeerMsg(sender, msg);
1391      }
1392    }
1393  }
1394
1395  /**
1396   * Responds to a monitor request message.
1397   *
1398   * @param msg
1399   *          The monitor request message.
1400   * @param sender
1401   *          The DS/RS which sent the monitor request.
1402   */
1403  void processMonitorRequestMsg(MonitorRequestMsg msg, ServerHandler sender)
1404  {
1405    enqueueMonitorMsg(msg, sender);
1406  }
1407
1408  /**
1409   * Responds to a monitor message.
1410   *
1411   * @param msg
1412   *          The monitor message
1413   * @param sender
1414   *          The DS/RS which sent the monitor.
1415   */
1416  void processMonitorMsg(MonitorMsg msg, ServerHandler sender)
1417  {
1418    domainMonitor.receiveMonitorDataResponse(msg, sender.getServerId());
1419  }
1420
1421  private void replyWithUnroutableMsgType(ServerHandler msgEmitter,
1422      RoutableMsg msg)
1423  {
1424    String msgClassname = msg.getClass().getCanonicalName();
1425    logger.info(NOTE_ERR_ROUTING_TO_SERVER, msgClassname);
1426
1427    LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
1428    mb.append(NOTE_ERR_ROUTING_TO_SERVER.get(msgClassname));
1429    mb.append("serverID:").append(msg.getDestination());
1430    ErrorMsg errMsg = new ErrorMsg(msg.getSenderID(), mb.toMessage());
1431    try
1432    {
1433      msgEmitter.send(errMsg);
1434    }
1435    catch (IOException ignored)
1436    {
1437      // an error happened on the sender session trying to recover
1438      // from an error on the receiver session.
1439      // Not much more we can do at this point.
1440    }
1441  }
1442
1443  private void replyWithUnreachablePeerMsg(ServerHandler msgEmitter,
1444      RoutableMsg msg)
1445  {
1446    LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
1447    mb.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(baseDN, msg.getDestination()));
1448    mb.append(" In Replication Server=").append(
1449      this.localReplicationServer.getMonitorInstanceName());
1450    mb.append(" unroutable message =").append(msg.getClass().getSimpleName());
1451    mb.append(" Details:routing table is empty");
1452    final LocalizableMessage message = mb.toMessage();
1453    logger.error(message);
1454
1455    ErrorMsg errMsg = new ErrorMsg(this.localReplicationServer.getServerId(),
1456        msg.getSenderID(), message);
1457    try
1458    {
1459      msgEmitter.send(errMsg);
1460    }
1461    catch (IOException ignored)
1462    {
1463      // TODO Handle error properly (sender timeout in addition)
1464      /*
1465       * An error happened trying to send an error msg to this server.
1466       * Log an error and close the connection to this server.
1467       */
1468      logger.error(ERR_CHANGELOG_ERROR_SENDING_ERROR, this, ignored);
1469      stopServer(msgEmitter, false);
1470    }
1471  }
1472
1473  private void forwardMsgToAllServers(RoutableMsg msg,
1474      List<ServerHandler> servers, ServerHandler sender)
1475  {
1476    for (ServerHandler targetHandler : servers)
1477    {
1478      try
1479      {
1480        targetHandler.send(msg);
1481      } catch (IOException ioe)
1482      {
1483        /*
1484         * An error happened trying to send a routable message to its
1485         * destination server.
1486         * Send back an error to the originator of the message.
1487         */
1488        LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
1489        mb.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(baseDN, msg.getDestination()));
1490        mb.append(" unroutable message =").append(msg.getClass().getSimpleName());
1491        mb.append(" Details: ").append(ioe.getLocalizedMessage());
1492        final LocalizableMessage message = mb.toMessage();
1493        logger.error(message);
1494
1495        ErrorMsg errMsg = new ErrorMsg(msg.getSenderID(), message);
1496        try
1497        {
1498          sender.send(errMsg);
1499        } catch (IOException ioe1)
1500        {
1501          // an error happened on the sender session trying to recover
1502          // from an error on the receiver session.
1503          // We don't have much solution left beside closing the sessions.
1504          stopServer(sender, false);
1505          stopServer(targetHandler, false);
1506        }
1507      // TODO Handle error properly (sender timeout in addition)
1508      }
1509    }
1510  }
1511
1512  /**
1513   * Creates a new monitor message including monitoring information for the
1514   * whole topology.
1515   *
1516   * @param sender
1517   *          The sender of this message.
1518   * @param destination
1519   *          The destination of this message.
1520   * @return The newly created and filled MonitorMsg. Null if a problem occurred
1521   *         during message creation.
1522   * @throws InterruptedException
1523   *           if this thread is interrupted while waiting for a response
1524   */
1525  public MonitorMsg createGlobalTopologyMonitorMsg(int sender, int destination)
1526      throws InterruptedException
1527  {
1528    return createGlobalTopologyMonitorMsg(sender, destination,
1529        domainMonitor.recomputeMonitorData());
1530  }
1531
1532  private MonitorMsg createGlobalTopologyMonitorMsg(int sender,
1533      int destination, ReplicationDomainMonitorData monitorData)
1534  {
1535    final MonitorMsg returnMsg = new MonitorMsg(sender, destination);
1536    returnMsg.setReplServerDbState(getLatestServerState());
1537
1538    // Add the server state for each DS and RS currently in the topology.
1539    for (int replicaId : toIterable(monitorData.ldapIterator()))
1540    {
1541      returnMsg.setServerState(replicaId,
1542          monitorData.getLDAPServerState(replicaId),
1543          monitorData.getApproxFirstMissingDate(replicaId), true);
1544    }
1545
1546    for (int replicaId : toIterable(monitorData.rsIterator()))
1547    {
1548      returnMsg.setServerState(replicaId,
1549          monitorData.getRSStates(replicaId),
1550          monitorData.getRSApproxFirstMissingDate(replicaId), false);
1551    }
1552
1553    return returnMsg;
1554  }
1555
1556
1557
1558  /**
1559   * Creates a new monitor message including monitoring information for the
1560   * topology directly connected to this RS. This includes information for: -
1561   * local RS - all direct DSs - all direct RSs
1562   *
1563   * @param sender
1564   *          The sender of this message.
1565   * @param destination
1566   *          The destination of this message.
1567   * @return The newly created and filled MonitorMsg. Null if the current thread
1568   *         was interrupted while attempting to get the domain lock.
1569   */
1570  private MonitorMsg createLocalTopologyMonitorMsg(int sender, int destination)
1571  {
1572    final MonitorMsg monitorMsg = new MonitorMsg(sender, destination);
1573    monitorMsg.setReplServerDbState(getLatestServerState());
1574
1575    // Add the server state for each connected DS and RS.
1576    for (DataServerHandler dsHandler : this.connectedDSs.values())
1577    {
1578      monitorMsg.setServerState(dsHandler.getServerId(),
1579          dsHandler.getServerState(), dsHandler.getApproxFirstMissingDate(),
1580          true);
1581    }
1582
1583    for (ReplicationServerHandler rsHandler : this.connectedRSs.values())
1584    {
1585      monitorMsg.setServerState(rsHandler.getServerId(),
1586          rsHandler.getServerState(), rsHandler.getApproxFirstMissingDate(),
1587          false);
1588    }
1589    return monitorMsg;
1590  }
1591
1592  /**
1593   * Shutdown this ReplicationServerDomain.
1594   */
1595  public void shutdown()
1596  {
1597    DirectoryServer.deregisterMonitorProvider(this);
1598
1599    // Terminate the assured timer
1600    assuredTimeoutTimer.cancel();
1601
1602    stopAllServers(true);
1603    statusAnalyzer.shutdown();
1604  }
1605
1606  /**
1607   * Returns the latest most current ServerState describing the newest CSNs for
1608   * each server in this domain.
1609   *
1610   * @return The ServerState describing the newest CSNs for each server in in
1611   *         this domain.
1612   */
1613  public ServerState getLatestServerState()
1614  {
1615    return domainDB.getDomainNewestCSNs(baseDN);
1616  }
1617
1618  /** {@inheritDoc} */
1619  @Override
1620  public String toString()
1621  {
1622    return "ReplicationServerDomain " + baseDN;
1623  }
1624
1625
1626
1627  /**
1628   * Creates a TopologyMsg filled with information to be sent to a remote RS.
1629   * We send remote RS the info of every DS that are directly connected to us
1630   * plus our own info as RS.
1631   * @return A suitable TopologyMsg PDU to be sent to a peer RS
1632   */
1633  public TopologyMsg createTopologyMsgForRS()
1634  {
1635    List<DSInfo> dsInfos = new ArrayList<>();
1636    for (DataServerHandler dsHandler : connectedDSs.values())
1637    {
1638      dsInfos.add(dsHandler.toDSInfo());
1639    }
1640
1641    // Create info for the local RS
1642    List<RSInfo> rsInfos = newArrayList(toRSInfo(localReplicationServer, generationId));
1643
1644    return new TopologyMsg(dsInfos, rsInfos);
1645  }
1646
1647  /**
1648   * Creates a TopologyMsg filled with information to be sent to a DS.
1649   * We send remote DS the info of every known DS and RS in the topology (our
1650   * directly connected DSs plus the DSs connected to other RSs) except himself.
1651   * Also put info related to local RS.
1652   *
1653   * @param destDsId The id of the DS the TopologyMsg PDU is to be sent to and
1654   * that we must not include in the DS list.
1655   * @return A suitable TopologyMsg PDU to be sent to a peer DS
1656   */
1657  public TopologyMsg createTopologyMsgForDS(int destDsId)
1658  {
1659    // Go through every DSs (except recipient of msg)
1660    List<DSInfo> dsInfos = new ArrayList<>();
1661    for (DataServerHandler dsHandler : connectedDSs.values())
1662    {
1663      if (dsHandler.getServerId() == destDsId)
1664      {
1665        continue;
1666      }
1667      dsInfos.add(dsHandler.toDSInfo());
1668    }
1669
1670
1671    List<RSInfo> rsInfos = new ArrayList<>();
1672    // Add our own info (local RS)
1673    rsInfos.add(toRSInfo(localReplicationServer, generationId));
1674
1675    // Go through every peer RSs (and get their connected DSs), also add info
1676    // for RSs
1677    for (ReplicationServerHandler rsHandler : connectedRSs.values())
1678    {
1679      rsInfos.add(rsHandler.toRSInfo());
1680
1681      rsHandler.addDSInfos(dsInfos);
1682    }
1683
1684    return new TopologyMsg(dsInfos, rsInfos);
1685  }
1686
1687  private RSInfo toRSInfo(ReplicationServer rs, long generationId)
1688  {
1689    return new RSInfo(rs.getServerId(), rs.getServerURL(), generationId,
1690        rs.getGroupId(), rs.getWeight());
1691  }
1692
1693  /**
1694   * Get the generationId associated to this domain.
1695   *
1696   * @return The generationId
1697   */
1698  public long getGenerationId()
1699  {
1700    return generationId;
1701  }
1702
1703  /**
1704   * Initialize the value of the generationID for this ReplicationServerDomain.
1705   * This method is intended to be used for initialization at startup and
1706   * simply stores the new value without any additional processing.
1707   * For example it does not clear the change-log DBs
1708   *
1709   * @param generationId The new value of generationId.
1710   */
1711  public void initGenerationID(long generationId)
1712  {
1713    synchronized (generationIDLock)
1714    {
1715      this.generationId = generationId;
1716      this.generationIdSavedStatus = true;
1717    }
1718  }
1719
1720  /**
1721   * Sets the provided value as the new in memory generationId.
1722   * Also clear the changelog databases.
1723   *
1724   * @param generationId The new value of generationId.
1725   * @return The old generation id
1726   */
1727  public long changeGenerationId(long generationId)
1728  {
1729    synchronized (generationIDLock)
1730    {
1731      long oldGenerationId = this.generationId;
1732
1733      if (this.generationId != generationId)
1734      {
1735        clearDbs();
1736
1737        this.generationId = generationId;
1738        this.generationIdSavedStatus = false;
1739      }
1740      return oldGenerationId;
1741    }
1742  }
1743
  /**
   * Resets the generationID.
   * <p>
   * The whole treatment runs with the domain lock held. If the calling thread
   * is interrupted while waiting for that lock, the thread is re-interrupted
   * and the method returns without doing anything. Any exception raised while
   * the lock is held is logged and not propagated.
   *
   * @param senderHandler The handler associated to the server
   *        that requested to reset the generationId.
   * @param genIdMsg The reset generation ID msg received.
   */
  public void resetGenerationId(ServerHandler senderHandler,
    ResetGenerationIdMsg genIdMsg)
  {
    if (logger.isTraceEnabled())
    {
      debug("Receiving ResetGenerationIdMsg from "
          + senderHandler.getServerId() + ":\n" + genIdMsg);
    }

    try
    {
      // Acquire lock on domain (see more details in comment of start() method
      // of ServerHandler)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We can't deal with this here, so re-interrupt thread so that it is
      // caught during subsequent IO.
      Thread.currentThread().interrupt();
      return;
    }

    // From here on the domain lock is held: it is released in the finally
    // block below, whatever happens.
    try
    {
      final long newGenId = genIdMsg.getGenerationId();
      if (newGenId != this.generationId)
      {
        // Stores the new value and clears the changelog DBs
        changeGenerationId(newGenId);
      }
      else
      {
        // Order to take a gen id we already have, just ignore
        if (logger.isTraceEnabled())
        {
          debug("Reset generation id requested but generationId was already "
              + this.generationId + ":\n" + genIdMsg);
        }
      }

      // Every connected RS handler is updated with the new generation id, but
      // the reset message itself is only forwarded when it was received from
      // a DS, i.e. when we are the first replication server warned.
      for (ServerHandler rsHandler : connectedRSs.values())
      {
        try
        {
          // After we'll have sent the message , the remote RS will adopt
          // the new genId
          rsHandler.setGenerationId(newGenId);
          if (senderHandler.isDataServer())
          {
            rsHandler.send(genIdMsg);
          }
        } catch (IOException e)
        {
          // A failure on one RS must not prevent the others from being handled
          logger.error(ERR_EXCEPTION_FORWARDING_RESET_GEN_ID, baseDN, e.getMessage());
        }
      }

      // Change status of the connected DSs according to the requested new
      // reference generation id
      for (DataServerHandler dsHandler : connectedDSs.values())
      {
        try
        {
          dsHandler.changeStatusForResetGenId(newGenId);
        } catch (IOException e)
        {
          // A failure on one DS must not prevent the others from being handled
          logger.error(ERR_EXCEPTION_CHANGING_STATUS_AFTER_RESET_GEN_ID, baseDN,
              dsHandler.getServerId(), e.getMessage());
        }
      }

      // Update every peers (RS/DS) with potential topology changes (status
      // change). Rather than doing that each time a DS has a status change
      // (consecutive to reset gen id message), we prefer advertising once for
      // all after changes (less packet sent), here at the end of the reset msg
      // treatment.
      sendTopoInfoToAll();

      // Note: this is logged even when the generation id was already the
      // requested one.
      logger.info(NOTE_RESET_GENERATION_ID, baseDN, newGenId);
    }
    catch(Exception e)
    {
      // Anything unexpected is logged as a single line stack trace
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }
  }
1842
1843  /**
1844   * Process message of a remote server changing his status.
1845   * @param senderHandler The handler associated to the server
1846   *        that changed his status.
1847   * @param csMsg The message containing the new status
1848   */
1849  public void processNewStatus(DataServerHandler senderHandler,
1850    ChangeStatusMsg csMsg)
1851  {
1852    if (logger.isTraceEnabled())
1853    {
1854      debug("receiving ChangeStatusMsg from " + senderHandler.getServerId()
1855          + ":\n" + csMsg);
1856    }
1857
1858    try
1859    {
1860      // Acquire lock on domain (see more details in comment of start() method
1861      // of ServerHandler)
1862      lock();
1863    }
1864    catch (InterruptedException ex)
1865    {
1866      // We can't deal with this here, so re-interrupt thread so that it is
1867      // caught during subsequent IO.
1868      Thread.currentThread().interrupt();
1869      return;
1870    }
1871
1872    try
1873    {
1874      ServerStatus newStatus = senderHandler.processNewStatus(csMsg);
1875      if (newStatus == ServerStatus.INVALID_STATUS)
1876      {
1877        // Already logged an error in processNewStatus()
1878        // just return not to forward a bad status to topology
1879        return;
1880      }
1881
1882      enqueueTopoInfoToAllExcept(senderHandler);
1883
1884      logger.info(NOTE_DIRECTORY_SERVER_CHANGED_STATUS,
1885          senderHandler.getServerId(), baseDN, newStatus);
1886    }
1887    catch(Exception e)
1888    {
1889      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
1890    }
1891    finally
1892    {
1893      release();
1894    }
1895  }
1896
  /**
   * Change the status of a directory server according to the event generated
   * from the status analyzer.
   * <p>
   * The status change runs with the domain lock held. When the status really
   * changed, the topology information is enqueued for every peer except the
   * provided DS. Errors occurring during the change are logged and do not
   * make this method return {@code true}.
   *
   * @param dsHandler The handler of the directory server to update
   * @param event The event to be used for new status computation
   * @return True if we have been interrupted (must stop), false otherwise
   */
  private boolean changeStatus(DataServerHandler dsHandler,
      StatusMachineEvent event)
  {
    try
    {
      // Acquire lock on domain (see ServerHandler#start() for more details)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We have been interrupted for dying, from stopStatusAnalyzer
      // to prevent deadlock in this situation:
      // RS is being shutdown, and stopServer will call stopStatusAnalyzer.
      // Domain lock is taken by shutdown thread while status analyzer thread
      // is willing to change the status of a server at the same time so is
      // waiting for the domain lock at the same time. As shutdown thread is
      // waiting for analyzer thread death, a deadlock occurs. So we force
      // interruption of the status analyzer thread death after 2 seconds if
      // it has not finished (see StatusAnalyzer.waitForShutdown). This allows
      // to have the analyzer thread taking the domain lock only when the
      // status of a DS has to be changed. See more comments in run method of
      // StatusAnalyzer.
      if (logger.isTraceEnabled())
      {
        logger.trace("Status analyzer for domain " + baseDN
            + " has been interrupted when"
            + " trying to acquire domain lock for changing the status of DS "
            + dsHandler.getServerId());
      }
      // The lock was not acquired: nothing to release, tell the caller to stop
      return true;
    }

    // The domain lock is held from here on and released in the finally block
    try
    {
      // Stays INVALID_STATUS when the change below fails with an IOException
      ServerStatus newStatus = ServerStatus.INVALID_STATUS;
      ServerStatus oldStatus = dsHandler.getStatus();
      try
      {
        newStatus = dsHandler.changeStatus(event);
      }
      catch (IOException e)
      {
        logger.error(ERR_EXCEPTION_CHANGING_STATUS_FROM_STATUS_ANALYZER,
            baseDN, dsHandler.getServerId(), e.getMessage());
      }

      if (newStatus == ServerStatus.INVALID_STATUS || newStatus == oldStatus)
      {
        // Change was impossible or already occurred (see StatusAnalyzer
        // comments)
        return false;
      }

      // The status really changed: advertise it to everybody but this DS
      enqueueTopoInfoToAllExcept(dsHandler);
    }
    catch (Exception e)
    {
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }

    return false;
  }
1970
1971  /**
1972   * Update every peers (RS/DS) with topology changes.
1973   */
1974  public void sendTopoInfoToAll()
1975  {
1976    enqueueTopoInfoToAllExcept(null);
1977  }
1978
1979  /**
1980   * Update every peers (RS/DS) with topology changes but one DS.
1981   *
1982   * @param dsHandler
1983   *          if not null, the topology message will not be sent to this DS
1984   */
1985  private void enqueueTopoInfoToAllExcept(DataServerHandler dsHandler)
1986  {
1987    synchronized (pendingStatusMessagesLock)
1988    {
1989      pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(dsHandler);
1990      pendingStatusMessages.enqueueTopoInfoToAllRSs();
1991    }
1992    statusAnalyzer.notifyPendingStatusMessage();
1993  }
1994
1995  /**
1996   * Clears the Db associated with that domain.
1997   */
1998  private void clearDbs()
1999  {
2000    try
2001    {
2002      domainDB.removeDomain(baseDN);
2003    }
2004    catch (ChangelogException e)
2005    {
2006      logger.error(ERR_ERROR_CLEARING_DB, baseDN, e.getMessage(), e);
2007    }
2008  }
2009
2010  /**
2011   * Returns whether the provided server is in degraded
2012   * state due to the fact that the peer server has an invalid
2013   * generationId for this domain.
2014   *
2015   * @param serverId The serverId for which we want to know the
2016   *                 the state.
2017   * @return Whether it is degraded or not.
2018   */
2019  public boolean isDegradedDueToGenerationId(int serverId)
2020  {
2021    if (logger.isTraceEnabled())
2022    {
2023      debug("isDegraded serverId=" + serverId + " given local generation Id="
2024          + this.generationId);
2025    }
2026
2027    ServerHandler sHandler = connectedRSs.get(serverId);
2028    if (sHandler == null)
2029    {
2030      sHandler = connectedDSs.get(serverId);
2031      if (sHandler == null)
2032      {
2033        return false;
2034      }
2035    }
2036
2037    if (logger.isTraceEnabled())
2038    {
2039      debug("Compute degradation of serverId=" + serverId
2040          + " LS server generation Id=" + sHandler.getGenerationId());
2041    }
2042    return sHandler.getGenerationId() != this.generationId;
2043  }
2044
  /**
   * Process topology information received from a peer RS.
   * <p>
   * The treatment runs with the domain lock held. If the calling thread is
   * interrupted while waiting for that lock, the thread is re-interrupted and
   * the method returns without processing the message. Exceptions raised
   * while the lock is held are caught and logged by this method.
   *
   * @param topoMsg The just received topo message from remote RS
   * @param rsHandler The handler that received the message.
   * @param allowResetGenId True for allowing to reset the generation id (
   * when called after initial handshake)
   * @throws IOException If an error occurred.
   * @throws DirectoryException If an error occurred.
   */
  public void receiveTopoInfoFromRS(TopologyMsg topoMsg,
      ReplicationServerHandler rsHandler, boolean allowResetGenId)
      throws IOException, DirectoryException
  {
    if (logger.isTraceEnabled())
    {
      debug("receiving TopologyMsg from serverId=" + rsHandler.getServerId()
          + ":\n" + topoMsg);
    }

    try
    {
      // Acquire lock on domain (see more details in comment of start() method
      // of ServerHandler)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We can't deal with this here, so re-interrupt thread so that it is
      // caught during subsequent IO.
      Thread.currentThread().interrupt();
      return;
    }

    // The domain lock is held from here on and released in the finally block
    try
    {
      // Store DS connected to remote RS & update information about the peer RS
      rsHandler.processTopoInfoFromRS(topoMsg);

      // Handle generation id
      if (allowResetGenId)
      {
        resetGenerationIdIfPossible();
        // Adopt the generation id of the peer RS only if ours is still unset
        setGenerationIdIfUnset(rsHandler.getGenerationId());
      }

      if (isDifferentGenerationId(rsHandler.getGenerationId()))
      {
        // Warn locally and tell the peer RS about the generation id mismatch
        LocalizableMessage message = WARN_BAD_GENERATION_ID_FROM_RS.get(rsHandler.getServerId(),
            rsHandler.session.getReadableRemoteAddress(), rsHandler.getGenerationId(),
            baseDN, getLocalRSServerId(), generationId);
        logger.warn(message);

        ErrorMsg errorMsg = new ErrorMsg(getLocalRSServerId(),
            rsHandler.getServerId(), message);
        rsHandler.send(errorMsg);
      }

      /*
       * Sends the currently known topology information to every connected
       * DS we have.
       */
      synchronized (pendingStatusMessagesLock)
      {
        pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(null);
      }
      statusAnalyzer.notifyPendingStatusMessage();
    }
    catch(Exception e)
    {
      // Anything unexpected is logged as a single line stack trace
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }
  }
2121
2122  private void setGenerationIdIfUnset(long generationId)
2123  {
2124    if (this.generationId < 0)
2125    {
2126      this.generationId = generationId;
2127    }
2128  }
2129
2130  /**
2131   * Returns the latest monitor data available for this replication server
2132   * domain.
2133   *
2134   * @return The latest monitor data available for this replication server
2135   *         domain, which is never {@code null}.
2136   */
2137  ReplicationDomainMonitorData getDomainMonitorData()
2138  {
2139    return domainMonitor.getMonitorData();
2140  }
2141
2142  /**
2143   * Get the map of connected DSs.
2144   * @return The map of connected DSs
2145   */
2146  public Map<Integer, DataServerHandler> getConnectedDSs()
2147  {
2148    return Collections.unmodifiableMap(connectedDSs);
2149  }
2150
2151  /**
2152   * Get the map of connected RSs.
2153   * @return The map of connected RSs
2154   */
2155  public Map<Integer, ReplicationServerHandler> getConnectedRSs()
2156  {
2157    return Collections.unmodifiableMap(connectedRSs);
2158  }
2159
2160
  /**
   * A synchronization mechanism is created to ensure exclusive access to the
   * domain. The goal is to have a consistent view of the topology by locking
   * the structures holding the topology view of the domain:
   * {@link #connectedDSs} and {@link #connectedRSs}. When a connection is
   * established with a peer DS or RS, the lock should be taken before updating
   * these structures, then released. The same mechanism should be used when
   * updating any data related to the view of the topology: for instance if the
   * status of a DS is changed, the lock should be taken before updating the
   * matching server handler and sending the topology messages to peers and
   * released after. This allows every member of the topology to have a
   * consistent view of the topology and to be sure it will not miss some
   * information.
   * <p>
   * So the locking system must be called (not exhaustive list):
   * <ul>
   * <li>when connection established with a DS or RS</li>
   * <li>when connection ended with a DS or RS</li>
   * <li>when receiving a TopologyMsg and updating structures</li>
   * <li>when creating and sending a TopologyMsg</li>
   * <li>when a DS status is changing (ChangeStatusMsg received or sent)</li>
   * </ul>
   */
  private final ReentrantLock lock = new ReentrantLock();
2185
  /**
   * This lock is used to protect the generationId variable: it is held while
   * the generationId is initialized or changed.
   */
  private final Object generationIDLock = new Object();
2190
2191  /**
2192   * Tests if the current thread has the lock on this domain.
2193   * @return True if the current thread has the lock.
2194   */
2195  public boolean hasLock()
2196  {
2197    return lock.getHoldCount() > 0;
2198  }
2199
2200  /**
2201   * Takes the lock on this domain (blocking until lock can be acquired) or
2202   * calling thread is interrupted.
2203   * @throws java.lang.InterruptedException If interrupted.
2204   */
2205  public void lock() throws InterruptedException
2206  {
2207    lock.lockInterruptibly();
2208  }
2209
2210  /**
2211   * Releases the lock on this domain.
2212   */
2213  public void release()
2214  {
2215    lock.unlock();
2216  }
2217
2218  /**
2219   * Tries to acquire the lock on the domain within a given amount of time.
2220   * @param timeout The amount of milliseconds to wait for acquiring the lock.
2221   * @return True if the lock was acquired, false if timeout occurred.
2222   * @throws java.lang.InterruptedException When call was interrupted.
2223   */
2224  public boolean tryLock(long timeout) throws InterruptedException
2225  {
2226    return lock.tryLock(timeout, TimeUnit.MILLISECONDS);
2227  }
2228
2229  /**
2230   * Starts the monitoring publisher for the domain if not already started.
2231   */
2232  private void startMonitoringPublisher()
2233  {
2234    long period = localReplicationServer.getMonitoringPublisherPeriod();
2235    if (period > 0) // 0 means no monitoring publisher
2236    {
2237      final MonitoringPublisher thread = new MonitoringPublisher(this, period);
2238      if (monitoringPublisher.compareAndSet(null, thread))
2239      {
2240        thread.start();
2241      }
2242    }
2243  }
2244
2245  /**
2246   * Stops the monitoring publisher for the domain.
2247   */
2248  private void stopMonitoringPublisher()
2249  {
2250    final MonitoringPublisher thread = monitoringPublisher.get();
2251    if (thread != null && monitoringPublisher.compareAndSet(thread, null))
2252    {
2253      thread.shutdown();
2254      thread.waitForShutdown();
2255    }
2256  }
2257
2258  /** {@inheritDoc} */
2259  @Override
2260  public void initializeMonitorProvider(MonitorProviderCfg configuraiton)
2261  {
2262    // Nothing to do for now
2263  }
2264
2265  /** {@inheritDoc} */
2266  @Override
2267  public String getMonitorInstanceName()
2268  {
2269    return "Replication server RS(" + localReplicationServer.getServerId()
2270        + ") " + localReplicationServer.getServerURL() + ",cn="
2271        + baseDN.toString().replace(',', '_').replace('=', '_')
2272        + ",cn=Replication";
2273  }
2274
2275  @Override
2276  public MonitorData getMonitorData()
2277  {
2278    int serverId = localReplicationServer.getServerId();
2279
2280    final MonitorData attributes = new MonitorData(5);
2281    attributes.add("replication-server-id", serverId);
2282    attributes.add("replication-server-port", localReplicationServer.getReplicationPort());
2283    attributes.add("domain-name", baseDN);
2284    attributes.add("generation-id", baseDN + " " + generationId);
2285    attributes.add("missing-changes", getDomainMonitorData().getMissingChangesRS(serverId));
2286    return attributes;
2287  }
2288
2289  /**
2290   * Returns the oldest known state for the domain, made of the oldest CSN
2291   * stored for each serverId.
2292   * <p>
2293   * Note: Because the replication changelogDB trimming always keep one change
2294   * whatever its date, the CSN contained in the returned state can be very old.
2295   *
2296   * @return the start state of the domain.
2297   */
2298  public ServerState getOldestState()
2299  {
2300    return domainDB.getDomainOldestCSNs(baseDN);
2301  }
2302
2303  private void sendTopologyMsg(String type, ServerHandler handler, TopologyMsg msg)
2304  {
2305    for (int i = 1; i <= 2; i++)
2306    {
2307      if (!handler.shuttingDown()
2308          && handler.getStatus() != ServerStatus.NOT_CONNECTED_STATUS)
2309      {
2310        try
2311        {
2312          handler.sendTopoInfo(msg);
2313          break;
2314        }
2315        catch (IOException e)
2316        {
2317          if (i == 2)
2318          {
2319            logger.error(ERR_EXCEPTION_SENDING_TOPO_INFO,
2320                baseDN, type, handler.getServerId(), e.getMessage());
2321          }
2322        }
2323      }
2324      sleep(100);
2325    }
2326  }
2327
2328
2329
2330  /**
2331   * Processes a ChangeTimeHeartbeatMsg received, by storing the CSN (timestamp)
2332   * value received, and forwarding the message to the other RSes.
2333   * @param senderHandler The handler for the server that sent the heartbeat.
2334   * @param msg The message to process.
2335   * @throws DirectoryException
2336   *           if a problem occurs
2337   */
2338  void processChangeTimeHeartbeatMsg(ServerHandler senderHandler,
2339      ChangeTimeHeartbeatMsg msg) throws DirectoryException
2340  {
2341    try
2342    {
2343      domainDB.replicaHeartbeat(baseDN, msg.getCSN());
2344    }
2345    catch (ChangelogException e)
2346    {
2347      throw new DirectoryException(ResultCode.OPERATIONS_ERROR, e
2348          .getMessageObject(), e);
2349    }
2350
2351    if (senderHandler.isDataServer())
2352    {
2353      /*
2354       * If we are the first replication server warned, then forward the message
2355       * to the remote replication servers.
2356       */
2357      synchronized (pendingStatusMessagesLock)
2358      {
2359        pendingStatusMessages.enqueueChangeTimeHeartbeatMsg(msg);
2360      }
2361      statusAnalyzer.notifyPendingStatusMessage();
2362    }
2363  }
2364
2365  /**
2366   * Return the monitor instance name of the ReplicationServer that created the
2367   * current instance.
2368   *
2369   * @return the monitor instance name of the ReplicationServer that created the
2370   *         current instance.
2371   */
2372  String getLocalRSMonitorInstanceName()
2373  {
2374    return this.localReplicationServer.getMonitorInstanceName();
2375  }
2376
2377  /**
2378   * Return the serverId of the ReplicationServer that created the current
2379   * instance.
2380   *
2381   * @return the serverId of the ReplicationServer that created the current
2382   *         instance.
2383   */
2384  int getLocalRSServerId()
2385  {
2386    return this.localReplicationServer.getServerId();
2387  }
2388
2389  /**
2390   * Update the monitoring publisher with the new period value.
2391   *
2392   * @param period
2393   *          The new period value.
2394   */
2395  void updateMonitoringPeriod(long period)
2396  {
2397    if (period == 0)
2398    {
2399      // Requested to stop monitoring publishers
2400      stopMonitoringPublisher();
2401      return;
2402    }
2403
2404    final MonitoringPublisher mpThread = monitoringPublisher.get();
2405    if (mpThread != null) // it is running
2406    {
2407      mpThread.setPeriod(period);
2408    }
2409    else if (!connectedDSs.isEmpty() || !connectedRSs.isEmpty())
2410    {
2411      // Requested to start monitoring publishers with provided period value
2412      startMonitoringPublisher();
2413    }
2414  }
2415
2416  /**
2417   * Registers a DS handler into this domain and notifies the domain about the
2418   * new DS.
2419   *
2420   * @param dsHandler
2421   *          The Directory Server Handler to register
2422   */
2423  public void register(DataServerHandler dsHandler)
2424  {
2425    startMonitoringPublisher();
2426
2427    // connected with new DS: store handler.
2428    connectedDSs.put(dsHandler.getServerId(), dsHandler);
2429
2430    // Tell peer RSs and DSs a new DS just connected to us
2431    // No need to re-send TopologyMsg to this just new DS
2432    enqueueTopoInfoToAllExcept(dsHandler);
2433  }
2434
2435  /**
2436   * Registers the RS handler into this domain and notifies the domain.
2437   *
2438   * @param rsHandler
2439   *          The Replication Server Handler to register
2440   */
2441  public void register(ReplicationServerHandler rsHandler)
2442  {
2443    startMonitoringPublisher();
2444
2445    // connected with new RS (either outgoing or incoming
2446    // connection): store handler.
2447    connectedRSs.put(rsHandler.getServerId(), rsHandler);
2448  }
2449
2450  private void debug(String message)
2451  {
2452    logger.trace("In ReplicationServerDomain serverId="
2453        + localReplicationServer.getServerId() + " for baseDN=" + baseDN
2454        + " and port=" + localReplicationServer.getReplicationPort()
2455        + ": " + message);
2456  }
2457
2458
2459
2460  /**
2461   * Go through each connected DS, get the number of pending changes we have for
2462   * it and change status accordingly if threshold value is crossed/uncrossed.
2463   */
2464  void checkDSDegradedStatus()
2465  {
2466    final int degradedStatusThreshold = localReplicationServer
2467        .getDegradedStatusThreshold();
2468    // Threshold value = 0 means no status analyzer (no degrading system)
2469    // we should not have that as the status analyzer thread should not be
2470    // created if this is the case, but for sanity purpose, we add this
2471    // test
2472    if (degradedStatusThreshold > 0)
2473    {
2474      for (DataServerHandler serverHandler : connectedDSs.values())
2475      {
2476        // Get number of pending changes for this server
2477        final int nChanges = serverHandler.getRcvMsgQueueSize();
2478        if (logger.isTraceEnabled())
2479        {
2480          logger.trace("In RS " + getLocalRSServerId() + ", for baseDN="
2481              + getBaseDN() + ": " + "Status analyzer: DS "
2482              + serverHandler.getServerId() + " has " + nChanges
2483              + " message(s) in writer queue.");
2484        }
2485
2486        // Check status to know if it is relevant to change the status. Do not
2487        // take RSD lock to test. If we attempt to change the status whereas
2488        // the current status does allow it, this will be noticed by
2489        // the changeStatusFromStatusAnalyzer() method. This allows to take the
2490        // lock roughly only when needed versus every sleep time timeout.
2491        if (nChanges >= degradedStatusThreshold)
2492        {
2493          if (serverHandler.getStatus() == NORMAL_STATUS
2494              && changeStatus(serverHandler, TO_DEGRADED_STATUS_EVENT))
2495          {
2496            break; // Interrupted.
2497          }
2498        }
2499        else
2500        {
2501          if (serverHandler.getStatus() == DEGRADED_STATUS
2502              && changeStatus(serverHandler, TO_NORMAL_STATUS_EVENT))
2503          {
2504            break; // Interrupted.
2505          }
2506        }
2507      }
2508    }
2509  }
2510
2511
2512
2513  /**
2514   * Sends any enqueued status messages to the rest of the topology.
2515   */
2516  void sendPendingStatusMessages()
2517  {
2518    /*
2519     * Take a snapshot of pending status notifications in order to avoid holding
2520     * the broadcast lock for too long. In addition, clear the notifications so
2521     * that they are not resent the next time.
2522     */
2523    final PendingStatusMessages savedState;
2524    synchronized (pendingStatusMessagesLock)
2525    {
2526      savedState = pendingStatusMessages;
2527      pendingStatusMessages = new PendingStatusMessages();
2528    }
2529    sendPendingChangeTimeHeartbeatMsgs(savedState);
2530    sendPendingTopologyMsgs(savedState);
2531    sendPendingMonitorMsgs(savedState);
2532  }
2533
2534
2535
2536  private void sendPendingMonitorMsgs(final PendingStatusMessages pendingMsgs)
2537  {
2538    for (Entry<Integer, MonitorMsg> msg : pendingMsgs.pendingDSMonitorMsgs
2539        .entrySet())
2540    {
2541      ServerHandler ds = connectedDSs.get(msg.getKey());
2542      if (ds != null)
2543      {
2544        try
2545        {
2546          ds.send(msg.getValue());
2547        }
2548        catch (IOException e)
2549        {
2550          // Ignore: connection closed.
2551        }
2552      }
2553    }
2554    for (Entry<Integer, MonitorMsg> msg : pendingMsgs.pendingRSMonitorMsgs
2555        .entrySet())
2556    {
2557      ServerHandler rs = connectedRSs.get(msg.getKey());
2558      if (rs != null)
2559      {
2560        try
2561        {
2562          rs.send(msg.getValue());
2563        }
2564        catch (IOException e)
2565        {
2566          // We log the error. The requestor will detect a timeout or
2567          // any other failure on the connection.
2568
2569          // FIXME: why do we log for RSs but not DSs?
2570          logger.traceException(e);
2571          logger.error(ERR_CHANGELOG_ERROR_SENDING_MSG, msg.getValue().getDestination());
2572        }
2573      }
2574    }
2575  }
2576
2577
2578
2579  private void sendPendingChangeTimeHeartbeatMsgs(PendingStatusMessages pendingMsgs)
2580  {
2581    for (ChangeTimeHeartbeatMsg pendingHeartbeat : pendingMsgs.pendingHeartbeats.values())
2582    {
2583      for (ReplicationServerHandler rsHandler : connectedRSs.values())
2584      {
2585        try
2586        {
2587          if (rsHandler.getProtocolVersion() >= REPLICATION_PROTOCOL_V3)
2588          {
2589            rsHandler.send(pendingHeartbeat);
2590          }
2591        }
2592        catch (IOException e)
2593        {
2594          logger.traceException(e);
2595          logger.error(ERR_CHANGELOG_ERROR_SENDING_MSG, "Replication Server "
2596              + localReplicationServer.getReplicationPort() + " " + baseDN
2597              + " " + localReplicationServer.getServerId());
2598          stopServer(rsHandler, false);
2599        }
2600      }
2601    }
2602  }
2603
2604
2605
2606  private void sendPendingTopologyMsgs(PendingStatusMessages pendingMsgs)
2607  {
2608    if (pendingMsgs.sendDSTopologyMsg)
2609    {
2610      for (ServerHandler handler : connectedDSs.values())
2611      {
2612        if (handler.getServerId() != pendingMsgs.excludedDSForTopologyMsg)
2613        {
2614          final TopologyMsg topoMsg = createTopologyMsgForDS(handler
2615              .getServerId());
2616          sendTopologyMsg("directory", handler, topoMsg);
2617        }
2618      }
2619    }
2620
2621    if (pendingMsgs.sendRSTopologyMsg && !connectedRSs.isEmpty())
2622    {
2623      final TopologyMsg topoMsg = createTopologyMsgForRS();
2624      for (ServerHandler handler : connectedRSs.values())
2625      {
2626        sendTopologyMsg("replication", handler, topoMsg);
2627      }
2628    }
2629  }
2630
2631
2632
2633  private void enqueueMonitorMsg(MonitorRequestMsg msg, ServerHandler sender)
2634  {
2635    /*
2636     * If the request comes from a Directory Server we need to build the full
2637     * list of all servers in the topology and send back a MonitorMsg with the
2638     * full list of all the servers in the topology.
2639     */
2640    if (sender.isDataServer())
2641    {
2642      MonitorMsg monitorMsg = createGlobalTopologyMonitorMsg(
2643          msg.getDestination(), msg.getSenderID(),
2644          domainMonitor.getMonitorData());
2645      synchronized (pendingStatusMessagesLock)
2646      {
2647        pendingStatusMessages.enqueueDSMonitorMsg(sender.getServerId(),
2648            monitorMsg);
2649      }
2650    }
2651    else
2652    {
2653      MonitorMsg monitorMsg = createLocalTopologyMonitorMsg(
2654          msg.getDestination(), msg.getSenderID());
2655      synchronized (pendingStatusMessagesLock)
2656      {
2657        pendingStatusMessages.enqueueRSMonitorMsg(sender.getServerId(),
2658            monitorMsg);
2659      }
2660    }
2661    statusAnalyzer.notifyPendingStatusMessage();
2662  }
2663}