001/* 002 * The contents of this file are subject to the terms of the Common Development and 003 * Distribution License (the License). You may not use this file except in compliance with the 004 * License. 005 * 006 * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the 007 * specific language governing permission and limitations under the License. 008 * 009 * When distributing Covered Software, include this CDDL Header Notice in each file and include 010 * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL 011 * Header, with the fields enclosed by brackets [] replaced by your own identifying 012 * information: "Portions Copyright [year] [name of copyright owner]". 013 * 014 * Copyright 2006-2010 Sun Microsystems, Inc. 015 * Portions Copyright 2011-2016 ForgeRock AS. 016 */ 017package org.opends.server.replication.server; 018 019import java.io.IOException; 020import java.util.ArrayList; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.List; 025import java.util.Map; 026import java.util.Map.Entry; 027import java.util.Timer; 028import java.util.TimerTask; 029import java.util.concurrent.ConcurrentHashMap; 030import java.util.concurrent.TimeUnit; 031import java.util.concurrent.atomic.AtomicReference; 032import java.util.concurrent.locks.ReentrantLock; 033 034import net.jcip.annotations.GuardedBy; 035 036import org.forgerock.i18n.LocalizableMessage; 037import org.forgerock.i18n.LocalizableMessageBuilder; 038import org.forgerock.i18n.slf4j.LocalizedLogger; 039import org.forgerock.opendj.ldap.ResultCode; 040import org.opends.server.admin.std.server.MonitorProviderCfg; 041import org.opends.server.api.MonitorData; 042import org.opends.server.api.MonitorProvider; 043import org.opends.server.core.DirectoryServer; 044import org.opends.server.replication.common.CSN; 045import org.opends.server.replication.common.DSInfo; 046import 
org.opends.server.replication.common.RSInfo; 047import org.opends.server.replication.common.ServerState; 048import org.opends.server.replication.common.ServerStatus; 049import org.opends.server.replication.common.StatusMachineEvent; 050import org.opends.server.replication.protocol.AckMsg; 051import org.opends.server.replication.protocol.ChangeStatusMsg; 052import org.opends.server.replication.protocol.ChangeTimeHeartbeatMsg; 053import org.opends.server.replication.protocol.ErrorMsg; 054import org.opends.server.replication.protocol.MonitorMsg; 055import org.opends.server.replication.protocol.MonitorRequestMsg; 056import org.opends.server.replication.protocol.ReplicaOfflineMsg; 057import org.opends.server.replication.protocol.ResetGenerationIdMsg; 058import org.opends.server.replication.protocol.RoutableMsg; 059import org.opends.server.replication.protocol.TopologyMsg; 060import org.opends.server.replication.protocol.UpdateMsg; 061import org.opends.server.replication.server.changelog.api.ChangelogException; 062import org.opends.server.replication.server.changelog.api.DBCursor; 063import org.opends.server.replication.server.changelog.api.DBCursor.CursorOptions; 064import org.opends.server.replication.server.changelog.api.ReplicationDomainDB; 065import org.forgerock.opendj.ldap.DN; 066import org.opends.server.types.DirectoryException; 067import org.opends.server.types.HostPort; 068 069import static org.opends.messages.ReplicationMessages.*; 070import static org.opends.server.replication.common.ServerStatus.*; 071import static org.opends.server.replication.common.StatusMachineEvent.*; 072import static org.opends.server.replication.protocol.ProtocolVersion.*; 073import static org.opends.server.replication.server.changelog.api.DBCursor.KeyMatchingStrategy.*; 074import static org.opends.server.replication.server.changelog.api.DBCursor.PositionStrategy.*; 075import static org.opends.server.util.CollectionUtils.*; 076import static org.opends.server.util.StaticUtils.*; 077 
/**
 * This class defines an in-memory cache that will be used to store
 * the messages that have been received from an LDAP server or
 * from another replication server and that should be forwarded to
 * other servers.
 *
 * The size of the cache is set by configuration.
 * If the cache becomes bigger than the configured size, the older messages
 * are removed and should they be needed again must be read from the backing
 * file.
 *
 * It runs a thread that is responsible for saving the messages
 * received to the disk and for trimming them.
 * Decision to trim can be based on disk space or age of the message.
 */
public class ReplicationServerDomain extends MonitorProvider<MonitorProviderCfg>
{
  /** The baseDN of the replication domain this object manages. */
  private final DN baseDN;

  /**
   * Periodically verifies whether the connected DSs are late and publishes any
   * pending status messages.
   */
  private final StatusAnalyzer statusAnalyzer;

  /**
   * The monitoring publisher that periodically sends monitoring messages to the
   * topology. Using an AtomicReference to avoid leaking references to costly
   * threads.
   */
  private final AtomicReference<MonitoringPublisher> monitoringPublisher = new AtomicReference<>();
  /** Maintains monitor data for the current domain. */
  private final ReplicationDomainMonitor domainMonitor = new ReplicationDomainMonitor(this);

  /**
   * The following map contains one balanced tree for each replica ID to which
   * we are currently publishing; the first update in the balanced tree is the
   * next change that we must push to this particular server.
   */
  private final Map<Integer, DataServerHandler> connectedDSs = new ConcurrentHashMap<>();

  /**
   * This map contains one ServerHandler for each replication server with which
   * we are connected (so normally all the replication servers); the first
   * update in the balanced tree is the next change that we must push to this
   * particular server.
   */
  private final Map<Integer, ReplicationServerHandler> connectedRSs = new ConcurrentHashMap<>();

  /** Gives access to the changelog DB for this domain. */
  private final ReplicationDomainDB domainDB;
  /** The ReplicationServer that created the current instance. */
  private final ReplicationServer localReplicationServer;

  /**
   * The generationId of the current replication domain. The generationId is
   * computed by hashing the first 1000 entries in the DB.
   */
  private volatile long generationId = -1;
  /**
   * JNR, this is legacy code, hard to follow logic. I think what this field
   * tries to say is: "is the generationId in use anywhere?", i.e. is there a
   * replication topology in place? As soon as an answer to any of these
   * question comes true, then it is set to true.
   * <p>
   * It looks like the only use of this field is to prevent the
   * {@link #generationId} from being reset by
   * {@link #resetGenerationIdIfPossible()}.
   */
  private volatile boolean generationIdSavedStatus;

  /** The tracer object for the debug logger. */
  private static final LocalizedLogger logger = LocalizedLogger.getLoggerForThisClass();

  /**
   * The needed info for each received assured update message we are waiting
   * acks for.
   * <p>
   * Key: a CSN matching a received update message which requested
   * assured mode usage (either safe read or safe data mode)
   * <p>
   * Value: The object holding every info needed about the already received acks
   * as well as the acks to be received.
   *
   * @see ExpectedAcksInfo For more details, see ExpectedAcksInfo and its sub
   *      classes javadoc.
   */
  private final Map<CSN, ExpectedAcksInfo> waitingAcks = new ConcurrentHashMap<>();

  /**
   * The timer used to run the timeout code (timer tasks) for the assured update
   * messages we are waiting acks for.
   */
  private final Timer assuredTimeoutTimer;
  /**
   * Counter used to purge the timer tasks references in assuredTimeoutTimer,
   * every n number of treated assured messages.
   */
  private int assuredTimeoutTimerPurgeCounter;



  /**
   * Stores pending status messages such as DS change time heartbeats for future
   * forwarding to the rest of the topology. This class is required in order to
   * decouple inbound IO processing from outbound IO processing and avoid
   * potential inter-process deadlocks. In particular, the {@code ServerReader}
   * thread must not send messages.
   */
  private static class PendingStatusMessages
  {
    // Latest heartbeat received per DS serverId, pending forwarding to RSs.
    private final Map<Integer, ChangeTimeHeartbeatMsg> pendingHeartbeats = new HashMap<>(1);
    // Pending monitor messages keyed by destination DS / RS serverId.
    private final Map<Integer, MonitorMsg> pendingDSMonitorMsgs = new HashMap<>(1);
    private final Map<Integer, MonitorMsg> pendingRSMonitorMsgs = new HashMap<>(1);
    private boolean sendRSTopologyMsg;
    private boolean sendDSTopologyMsg;
    // -1 means "no DS is excluded from the next DS topology broadcast".
    private int excludedDSForTopologyMsg = -1;

    /**
     * Enqueues a TopologyMsg for all the connected directory servers in order
     * to let them know the topology (every known DSs and RSs).
     *
     * @param excludedDS
     *          If not null, the topology message will not be sent to this DS.
     */
    private void enqueueTopoInfoToAllDSsExcept(DataServerHandler excludedDS)
    {
      int excludedServerId = excludedDS != null ? excludedDS.getServerId() : -1;
      if (sendDSTopologyMsg)
      {
        // A broadcast is already pending: if the new exclusion differs from the
        // pending one, we can no longer exclude anybody - send to all DSs.
        if (excludedServerId != excludedDSForTopologyMsg)
        {
          excludedDSForTopologyMsg = -1;
        }
      }
      else
      {
        sendDSTopologyMsg = true;
        excludedDSForTopologyMsg = excludedServerId;
      }
    }

    /**
     * Enqueues a TopologyMsg for all the connected replication servers in order
     * to let them know our connected LDAP servers.
     */
    private void enqueueTopoInfoToAllRSs()
    {
      sendRSTopologyMsg = true;
    }

    /**
     * Enqueues a ChangeTimeHeartbeatMsg received from a DS for forwarding to
     * all other RS instances.
     *
     * @param msg
     *          The heartbeat message.
     */
    private void enqueueChangeTimeHeartbeatMsg(ChangeTimeHeartbeatMsg msg)
    {
      // Only the latest heartbeat per source DS is kept.
      pendingHeartbeats.put(msg.getCSN().getServerId(), msg);
    }

    /** Enqueues a monitor message for later delivery to the given DS. */
    private void enqueueDSMonitorMsg(int dsServerId, MonitorMsg msg)
    {
      pendingDSMonitorMsgs.put(dsServerId, msg);
    }

    /** Enqueues a monitor message for later delivery to the given RS. */
    private void enqueueRSMonitorMsg(int rsServerId, MonitorMsg msg)
    {
      pendingRSMonitorMsgs.put(rsServerId, msg);
    }

    /** {@inheritDoc} */
    @Override
    public String toString()
    {
      return getClass().getSimpleName()
          + " pendingHeartbeats=" + pendingHeartbeats
          + ", pendingDSMonitorMsgs=" + pendingDSMonitorMsgs
          + ", pendingRSMonitorMsgs=" + pendingRSMonitorMsgs
          + ", sendRSTopologyMsg=" + sendRSTopologyMsg
          + ", sendDSTopologyMsg=" + sendDSTopologyMsg
          + ", excludedDSForTopologyMsg=" + excludedDSForTopologyMsg;
    }
  }

  /** Guards all reads and writes of {@link #pendingStatusMessages}. */
  private final Object pendingStatusMessagesLock = new Object();

  @GuardedBy("pendingStatusMessagesLock")
  private PendingStatusMessages pendingStatusMessages = new PendingStatusMessages();

  /**
   * Creates a new ReplicationServerDomain associated to the baseDN.
   *
   * @param baseDN
   *          The baseDN associated to the ReplicationServerDomain.
   * @param localReplicationServer
   *          the ReplicationServer that created this instance.
   */
  public ReplicationServerDomain(DN baseDN,
      ReplicationServer localReplicationServer)
  {
    this.baseDN = baseDN;
    this.localReplicationServer = localReplicationServer;
    // Daemon timer so it does not prevent JVM shutdown.
    this.assuredTimeoutTimer = new Timer("Replication server RS("
        + localReplicationServer.getServerId()
        + ") assured timer for domain \"" + baseDN + "\"", true);
    this.domainDB =
        localReplicationServer.getChangelogDB().getReplicationDomainDB();
    this.statusAnalyzer = new StatusAnalyzer(this);
    this.statusAnalyzer.start();
    DirectoryServer.registerMonitorProvider(this);
  }

  /**
   * Add an update that has been received to the list of
   * updates that must be forwarded to all other servers.
   *
   * @param updateMsg The update that has been received.
   * @param sourceHandler The ServerHandler for the server from which the
   *        update was received
   * @throws IOException When an IO exception happens during the update
   *         processing.
   */
  public void put(UpdateMsg updateMsg, ServerHandler sourceHandler) throws IOException
  {
    sourceHandler.updateServerState(updateMsg);
    sourceHandler.incrementInCount();
    setGenerationIdIfUnset(sourceHandler.getGenerationId());

    /**
     * If this is an assured message (a message requesting ack), we must
     * construct the ExpectedAcksInfo object with the right number of expected
     * acks before posting message to the writers. Otherwise some writers may
     * have time to post, receive the ack and increment received ack counter
     * (kept in ExpectedAcksInfo object) and we could think the acknowledgment
     * is fully processed although it may be not (some other acks from other
     * servers are not yet arrived). So for that purpose we do a pre-loop
     * to determine to who we will post an assured message.
     * Whether the assured mode is safe read or safe data, we anyway do not
     * support the assured replication feature across topologies with different
     * group ids. The assured feature ensures assured replication based on the
     * same locality (group id). For instance in double data center deployment
     * (2 group id usage) with assured replication enabled, an assured message
     * sent from data center 1 (group id = 1) will be sent to servers of both
     * data centers, but one will request and wait acks only from servers of the
     * data center 1.
     */
    final PreparedAssuredInfo preparedAssuredInfo = getPreparedAssuredInfo(updateMsg, sourceHandler);

    if (!publishUpdateMsg(updateMsg))
    {
      // Changelog DB failure: the whole RS is shutting down, nothing to forward.
      return;
    }

    final List<Integer> assuredServers = getAssuredServers(updateMsg, preparedAssuredInfo);

    /**
     * The update message equivalent to the originally received update message,
     * but with assured flag disabled. This message is the one that should be
     * sent to non eligible servers for assured mode.
     * We need a clone like of the original message with assured flag off, to be
     * posted to servers we don't want to wait the ack from (not normal status
     * servers or servers with different group id). This must be done because
     * the posted message is a reference so each writer queue gets the same
     * reference, thus, changing the assured flag of an object is done for every
     * references posted on every writer queues. That is why we need a message
     * version with assured flag on and another one with assured flag off.
     */
    final NotAssuredUpdateMsg notAssuredUpdateMsg =
        preparedAssuredInfo != null ? new NotAssuredUpdateMsg(updateMsg) : null;

    // Push the message to the replication servers
    if (sourceHandler.isDataServer())
    {
      for (ReplicationServerHandler rsHandler : connectedRSs.values())
      {
        /**
         * Ignore updates to RS with bad gen id
         * (no system managed status for a RS)
         */
        if (!isDifferentGenerationId(rsHandler, updateMsg))
        {
          addUpdate(rsHandler, updateMsg, notAssuredUpdateMsg, assuredServers);
        }
      }
    }

    // Push the message to the LDAP servers
    for (DataServerHandler dsHandler : connectedDSs.values())
    {
      // Do not forward the change to the server that just sent it
      if (dsHandler != sourceHandler
          && !isUpdateMsgFiltered(updateMsg, dsHandler))
      {
        addUpdate(dsHandler, updateMsg, notAssuredUpdateMsg, assuredServers);
      }
    }
  }

  /**
   * Returns whether the given RS has a generation id different from the local
   * one, tracing the skipped update when it does.
   *
   * @param rsHandler the remote RS to check
   * @param updateMsg the update that would be sent (used for tracing only)
   * @return true when the RS generation id differs from the local one
   */
  private boolean isDifferentGenerationId(ReplicationServerHandler rsHandler,
      UpdateMsg updateMsg)
  {
    final boolean isDifferent = isDifferentGenerationId(rsHandler.getGenerationId());
    if (isDifferent && logger.isTraceEnabled())
    {
      debug("updateMsg " + updateMsg.getCSN()
          + " will not be sent to replication server "
          + rsHandler.getServerId() + " with generation id "
          + rsHandler.getGenerationId() + " different from local "
          + "generation id " + generationId);
    }
    return isDifferent;
  }

  /**
   * Ignore updates to DS in bad BAD_GENID_STATUS or FULL_UPDATE_STATUS.
   * <p>
   * The RSD lock should not be taken here as it is acceptable to have a delay
   * between the time the server has a wrong status and the fact we detect it:
   * the updates that succeed to pass during this time will have no impact on
   * remote server. But it is interesting to not saturate uselessly the network
   * if the updates are not necessary so this check to stop sending updates is
   * interesting anyway. Not taking the RSD lock allows to have better
   * performances in normal mode (most of the time).
   */
  private boolean isUpdateMsgFiltered(UpdateMsg updateMsg, DataServerHandler dsHandler)
  {
    final ServerStatus dsStatus = dsHandler.getStatus();
    if (dsStatus == ServerStatus.BAD_GEN_ID_STATUS)
    {
      if (logger.isTraceEnabled())
      {
        debug("updateMsg " + updateMsg.getCSN()
            + " will not be sent to directory server "
            + dsHandler.getServerId() + " with generation id "
            + dsHandler.getGenerationId() + " different from local "
            + "generation id " + generationId);
      }
      return true;
    }
    else if (dsStatus == ServerStatus.FULL_UPDATE_STATUS)
    {
      if (logger.isTraceEnabled())
      {
        debug("updateMsg " + updateMsg.getCSN()
            + " will not be sent to directory server "
            + dsHandler.getServerId() + " as it is in full update");
      }
      return true;
    }
    return false;
  }

  /**
   * Builds the expected-acks bookkeeping for an assured update, dispatching on
   * the assured sub-mode. Returns null when the message is not assured, when
   * the source speaks a protocol older than V2, or on an unknown assured mode.
   *
   * @param updateMsg the received update
   * @param sourceHandler the server the update came from
   * @return the prepared info, or null when no acks should be tracked
   * @throws IOException if an immediate ack needs to be sent and fails
   */
  private PreparedAssuredInfo getPreparedAssuredInfo(UpdateMsg updateMsg,
      ServerHandler sourceHandler) throws IOException
  {
    // Assured feature is supported starting from replication protocol V2
    if (!updateMsg.isAssured()
        || sourceHandler.getProtocolVersion() < REPLICATION_PROTOCOL_V2)
    {
      return null;
    }

    // According to assured sub-mode, prepare structures to keep track of
    // the acks we are interested in.
    switch (updateMsg.getAssuredMode())
    {
    case SAFE_DATA_MODE:
      sourceHandler.incrementAssuredSdReceivedUpdates();
      return processSafeDataUpdateMsg(updateMsg, sourceHandler);

    case SAFE_READ_MODE:
      sourceHandler.incrementAssuredSrReceivedUpdates();
      return processSafeReadUpdateMsg(updateMsg, sourceHandler);

    default:
      // Unknown assured mode: should never happen
      logger.error(ERR_RS_UNKNOWN_ASSURED_MODE,
          localReplicationServer.getServerId(), updateMsg.getAssuredMode(), baseDN, updateMsg);
      return null;
    }
  }

  /**
   * Registers the expected-acks info in {@link #waitingAcks} and arms the
   * assured timeout timer for it, returning the list of servers whose acks are
   * awaited (empty when none).
   *
   * @param updateMsg the assured update being tracked
   * @param preparedAssuredInfo the info computed by the safe-read/safe-data
   *        processing, may be null
   * @return the expected servers list, never null
   */
  private List<Integer> getAssuredServers(UpdateMsg updateMsg, PreparedAssuredInfo preparedAssuredInfo)
  {
    List<Integer> expectedServers = null;
    if (preparedAssuredInfo != null && preparedAssuredInfo.expectedServers != null)
    {
      expectedServers = preparedAssuredInfo.expectedServers;
      // Store the expected acks info into the global map.
      // The code for processing reception of acks for this update will update
      // info kept in this object and if enough acks received, it will send
      // back the final ack to the requester and remove the object from this map
      // OR
      // The following timer will time out and send an timeout ack to the
      // requester if the acks are not received in time. The timer will also
      // remove the object from this map.
      final CSN csn = updateMsg.getCSN();
      waitingAcks.put(csn, preparedAssuredInfo.expectedAcksInfo);

      // Arm timer for this assured update message (wait for acks until it times out)
      final AssuredTimeoutTask assuredTimeoutTask = new AssuredTimeoutTask(csn);
      assuredTimeoutTimer.schedule(assuredTimeoutTask, localReplicationServer.getAssuredTimeout());
      // Purge timer every 100 treated messages
      assuredTimeoutTimerPurgeCounter++;
      if ((assuredTimeoutTimerPurgeCounter % 100) == 0)
      {
        assuredTimeoutTimer.purge();
      }
    }

    return expectedServers != null ? expectedServers : Collections.<Integer> emptyList();
  }

  /**
   * Persists the update to the changelog DB (or records the replica-offline
   * event). Returns false when a changelog error forced the whole replication
   * server to shut down.
   *
   * @param updateMsg the update to persist
   * @return true when processing may continue, false after a fatal DB error
   */
  private boolean publishUpdateMsg(UpdateMsg updateMsg)
  {
    try
    {
      if (updateMsg instanceof ReplicaOfflineMsg)
      {
        final ReplicaOfflineMsg offlineMsg = (ReplicaOfflineMsg) updateMsg;
        this.domainDB.notifyReplicaOffline(baseDN, offlineMsg.getCSN());
        return true;
      }

      if (this.domainDB.publishUpdateMsg(baseDN, updateMsg))
      {
        /*
         * JNR: Matt and I had a hard time figuring out where to put this
         * synchronized block. We elected to put it here, but without a strong
         * conviction.
         */
        synchronized (generationIDLock)
        {
          /*
           * JNR: I think the generationIdSavedStatus is set to true because
           * method above created a ReplicaDB which assumes the generationId was
           * communicated to another server. Hence setting true on this field
           * prevent the generationId from being reset.
           */
          generationIdSavedStatus = true;
        }
      }
      return true;
    }
    catch (ChangelogException e)
    {
      /*
       * Because of database problem we can't save any more changes from at
       * least one LDAP server. This replicationServer therefore can't do it's
       * job properly anymore and needs to close all its connections and
       * shutdown itself.
       */
      logger.error(ERR_CHANGELOG_SHUTDOWN_DATABASE_ERROR, stackTraceToSingleLineString(e));
      localReplicationServer.shutdown();
      return false;
    }
  }

  /**
   * Posts the right flavour of the update to the given server's writer queue:
   * the not-assured clone for servers we do not expect an ack from, the
   * original (assured) message otherwise.
   */
  private void addUpdate(ServerHandler sHandler, UpdateMsg updateMsg,
      NotAssuredUpdateMsg notAssuredUpdateMsg, List<Integer> assuredServers)
  {
    // Assured mode: post an assured or not assured matching update message
    // according to what has been computed for the destination server
    if (notAssuredUpdateMsg != null
        && !assuredServers.contains(sHandler.getServerId()))
    {
      sHandler.add(notAssuredUpdateMsg);
    }
    else
    {
      sHandler.add(updateMsg);
    }
  }

  /**
   * Helper class to be the return type of a method that processes a just
   * received assured update message:
   * - processSafeReadUpdateMsg
   * - processSafeDataUpdateMsg
   * This is a facility to pack many interesting returned object.
   */
  private class PreparedAssuredInfo
  {
    /**
     * The list of servers identified as servers we are interested in
     * receiving acks from. If this list is not null, then expectedAcksInfo
     * should be not null.
     * Servers that are not in this list are servers not eligible for an ack
     * request.
     */
    public List<Integer> expectedServers;

    /**
     * The constructed ExpectedAcksInfo object to be used when acks will be
     * received. Null if expectedServers is null.
     */
    public ExpectedAcksInfo expectedAcksInfo;
  }

  /**
   * Process a just received assured update message in Safe Read mode. If the
   * ack can be sent immediately, it is done here. This will also determine to
   * which suitable servers an ack should be requested from, and which ones are
   * not eligible for an ack request.
   * This method is an helper method for the put method. Have a look at the put
   * method for a better understanding.
   * @param update The just received assured update to process.
   * @param sourceHandler The ServerHandler for the server from which the
   *        update was received
   * @return A suitable PreparedAssuredInfo object that contains every needed
   * info to proceed with post to server writers.
   * @throws IOException When an IO exception happens during the update
   *         processing.
   */
  private PreparedAssuredInfo processSafeReadUpdateMsg(
      UpdateMsg update, ServerHandler sourceHandler) throws IOException
  {
    CSN csn = update.getCSN();
    byte groupId = localReplicationServer.getGroupId();
    byte sourceGroupId = sourceHandler.getGroupId();
    List<Integer> expectedServers = new ArrayList<>();
    List<Integer> wrongStatusServers = new ArrayList<>();

    if (sourceGroupId == groupId)
    // Assured feature does not cross different group ids
    {
      if (sourceHandler.isDataServer())
      {
        collectRSsEligibleForAssuredReplication(groupId, expectedServers);
      }

      // Look for DS eligible for assured
      for (DataServerHandler dsHandler : connectedDSs.values())
      {
        // Don't forward the change to the server that just sent it
        if (dsHandler == sourceHandler)
        {
          continue;
        }
        if (dsHandler.getGroupId() == groupId)
        // No ack expected from a DS with different group id
        {
          ServerStatus serverStatus = dsHandler.getStatus();
          if (serverStatus == ServerStatus.NORMAL_STATUS)
          {
            expectedServers.add(dsHandler.getServerId());
          } else if (serverStatus == ServerStatus.DEGRADED_STATUS) {
            // No ack expected from a DS with wrong status
            wrongStatusServers.add(dsHandler.getServerId());
          }
          /*
           * else
           * BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS:
           * We do not want this to be reported as an error to the update
           * maker -> no pollution or potential misunderstanding when
           * reading logs or monitoring and it was just administration (for
           * instance new server is being configured in topo: it goes in bad
           * gen then full update).
           */
        }
      }
    }

    // Return computed structures
    PreparedAssuredInfo preparedAssuredInfo = new PreparedAssuredInfo();
    if (!expectedServers.isEmpty())
    {
      // Some other acks to wait for
      preparedAssuredInfo.expectedAcksInfo = new SafeReadExpectedAcksInfo(csn,
          sourceHandler, expectedServers, wrongStatusServers);
      preparedAssuredInfo.expectedServers = expectedServers;
    }

    if (preparedAssuredInfo.expectedServers == null)
    {
      // No eligible servers found, send the ack immediately
      sourceHandler.send(new AckMsg(csn));
    }

    return preparedAssuredInfo;
  }

  /**
   * Process a just received assured update message in Safe Data mode. If the
   * ack can be sent immediately, it is done here. This will also determine to
   * which suitable servers an ack should be requested from, and which ones are
   * not eligible for an ack request.
   * This method is an helper method for the put method. Have a look at the put
   * method for a better understanding.
   * @param update The just received assured update to process.
   * @param sourceHandler The ServerHandler for the server from which the
   *        update was received
   * @return A suitable PreparedAssuredInfo object that contains every needed
   * info to proceed with post to server writers.
   * @throws IOException When an IO exception happens during the update
   *         processing.
   */
  private PreparedAssuredInfo processSafeDataUpdateMsg(
      UpdateMsg update, ServerHandler sourceHandler) throws IOException
  {
    CSN csn = update.getCSN();
    boolean interestedInAcks = false;
    byte safeDataLevel = update.getSafeDataLevel();
    byte groupId = localReplicationServer.getGroupId();
    byte sourceGroupId = sourceHandler.getGroupId();
    if (safeDataLevel < (byte) 1)
    {
      // Should never happen
      logger.error(ERR_UNKNOWN_ASSURED_SAFE_DATA_LEVEL,
          localReplicationServer.getServerId(), safeDataLevel, baseDN, update);
    } else if (sourceGroupId == groupId
        // Assured feature does not cross different group IDS
        && isSameGenerationId(sourceHandler.getGenerationId()))
        // Ignore assured updates from wrong generationId servers
    {
      if (sourceHandler.isDataServer())
      {
        if (safeDataLevel == (byte) 1)
        {
          /**
           * Immediately return the ack for an assured message in safe data
           * mode with safe data level 1, coming from a DS. No need to wait
           * for more acks
           */
          sourceHandler.send(new AckMsg(csn));
        } else
        {
          /**
           * level > 1 : We need further acks
           * The message will be posted in assured mode to eligible
           * servers. The embedded safe data level is not changed, and his
           * value will be used by a remote RS to determine if he must send
           * an ack (level > 1) or not (level = 1)
           */
          interestedInAcks = true;
        }
      } else
      { // A RS sent us the safe data message, for sure no further ack to wait
        /**
         * Level 1 has already been reached so no further acks to wait.
         * Just deal with level > 1
         */
        if (safeDataLevel > (byte) 1)
        {
          sourceHandler.send(new AckMsg(csn));
        }
      }
    }

    List<Integer> expectedServers = new ArrayList<>();
    if (interestedInAcks && sourceHandler.isDataServer())
    {
      collectRSsEligibleForAssuredReplication(groupId, expectedServers);
    }

    // Return computed structures
    PreparedAssuredInfo preparedAssuredInfo = new PreparedAssuredInfo();
    int nExpectedServers = expectedServers.size();
    if (interestedInAcks) // interestedInAcks so level > 1
    {
      if (nExpectedServers > 0)
      {
        // Some other acks to wait for
        int sdl = update.getSafeDataLevel();
        int neededAdditionalServers = sdl - 1;
        // Change the number of expected acks if not enough available eligible
        // servers: the level is a best effort thing, we do not want to timeout
        // at every assured SD update for instance if a RS has had his gen id
        // reseted
        byte finalSdl = (nExpectedServers >= neededAdditionalServers) ?
            (byte)sdl : // Keep level as it was
            (byte)(nExpectedServers+1); // Change level to match what's available
        preparedAssuredInfo.expectedAcksInfo = new SafeDataExpectedAcksInfo(csn,
            sourceHandler, finalSdl, expectedServers);
        preparedAssuredInfo.expectedServers = expectedServers;
      } else
      {
        // level > 1 and source is a DS but no eligible servers found, send the
        // ack immediately
        sourceHandler.send(new AckMsg(csn));
      }
    }

    return preparedAssuredInfo;
  }

  /**
   * Adds to {@code expectedServers} the serverIds of the connected RSs with
   * the same group id and the same generation id as the local server, i.e.
   * those eligible to ack an assured update.
   */
  private void collectRSsEligibleForAssuredReplication(byte groupId,
      List<Integer> expectedServers)
  {
    for (ReplicationServerHandler rsHandler : connectedRSs.values())
    {
      if (rsHandler.getGroupId() == groupId
          // No ack expected from a RS with different group id
          && isSameGenerationId(rsHandler.getGenerationId())
          // No ack expected from a RS with bad gen id
          )
      {
        expectedServers.add(rsHandler.getServerId());
      }
    }
  }

  /** Returns true when the local generation id is set (&gt; 0) and equals the given one. */
  private boolean isSameGenerationId(long generationId)
  {
    return this.generationId > 0 && this.generationId == generationId;
  }

  /** Returns true when the local generation id is set (&gt; 0) and differs from the given one. */
  private boolean isDifferentGenerationId(long generationId)
  {
    return this.generationId > 0 && this.generationId != generationId;
  }

  /**
   * Process an ack received from a given server.
   *
   * @param ack The ack message received.
   * @param ackingServer The server handler of the server that sent the ack.
   */
  void processAck(AckMsg ack, ServerHandler ackingServer)
  {
    // Retrieve the expected acks info for the update matching the original
    // sent update.
    CSN csn = ack.getCSN();
    ExpectedAcksInfo expectedAcksInfo = waitingAcks.get(csn);

    if (expectedAcksInfo != null)
    {
      // Prevent concurrent access from processAck() or AssuredTimeoutTask.run()
      synchronized (expectedAcksInfo)
      {
        if (expectedAcksInfo.isCompleted())
        {
          // Timeout code is sending a timeout ack, do nothing and let him
          // remove object from the map
          return;
        }
        /**
         * If this is the last ack we were waiting from, immediately create and
         * send the final ack to the original server
         */
        if (expectedAcksInfo.processReceivedAck(ackingServer, ack))
        {
          // Remove the object from the map as no more needed
          waitingAcks.remove(csn);
          AckMsg finalAck = expectedAcksInfo.createAck(false);
          ServerHandler origServer = expectedAcksInfo.getRequesterServer();
          try
          {
            origServer.send(finalAck);
          } catch (IOException e)
          {
            /**
             * An error happened trying the send back an ack to the server.
             * Log an error and close the connection to this server.
             */
            LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
            mb.append(ERR_RS_ERROR_SENDING_ACK.get(
                localReplicationServer.getServerId(), origServer.getServerId(), csn, baseDN));
            mb.append(" ");
            mb.append(stackTraceToSingleLineString(e));
            logger.error(mb.toMessage());
            stopServer(origServer, false);
          }
          // Mark the ack info object as completed to prevent potential timeout
          // code parallel run
          expectedAcksInfo.completed();
        }
      }
    }
    /* Else the timeout occurred for the update matching this CSN
     * and the ack with timeout error has probably already been sent.
     */
  }

  /**
   * The code run when the timeout occurs while waiting for acks of the
   * eligible servers. This basically sends a timeout ack (with any additional
   * error info) to the original server that sent an assured update message.
   */
  private class AssuredTimeoutTask extends TimerTask
  {
    /** The CSN of the assured update this task is the timeout for. */
    private CSN csn;

    /**
     * Constructor for the timer task.
     * @param csn The CSN of the assured update we are waiting acks for
     */
    public AssuredTimeoutTask(CSN csn)
    {
      this.csn = csn;
    }

    /**
     * Run when the assured timeout for an assured update message we are waiting
     * acks for occurs.
     */
    @Override
    public void run()
    {
      ExpectedAcksInfo expectedAcksInfo = waitingAcks.get(csn);

      if (expectedAcksInfo != null)
      {
        // Same handshake as processAck(): whoever completes first wins.
        synchronized (expectedAcksInfo)
        {
          if (expectedAcksInfo.isCompleted())
          {
            // processAck() code is sending the ack, do nothing and let him
            // remove object from the map
            return;
          }
          // Remove the object from the map as no more needed
          waitingAcks.remove(csn);
          // Create the timeout ack and send him to the server the assured
          // update message came from
          AckMsg finalAck = expectedAcksInfo.createAck(true);
          ServerHandler origServer = expectedAcksInfo.getRequesterServer();
          if (logger.isTraceEnabled())
          {
            debug("sending timeout for assured update with CSN " + csn
                + " to serverId=" + origServer.getServerId());
          }
          try
          {
            origServer.send(finalAck);
          } catch (IOException e)
          {
            /**
             * An error happened trying the send back an ack to the server.
             * Log an error and close the connection to this server.
             */
            LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
            mb.append(ERR_RS_ERROR_SENDING_ACK.get(
                localReplicationServer.getServerId(), origServer.getServerId(), csn, baseDN));
            mb.append(" ");
            mb.append(stackTraceToSingleLineString(e));
            logger.error(mb.toMessage());
            stopServer(origServer, false);
          }
          // Increment assured counters
          boolean safeRead =
              expectedAcksInfo instanceof SafeReadExpectedAcksInfo;
          if (safeRead)
          {
            origServer.incrementAssuredSrReceivedUpdatesTimeout();
          } else
          {
            if (origServer.isDataServer())
            {
              origServer.incrementAssuredSdReceivedUpdatesTimeout();
            }
          }
          // retrieve expected servers in timeout to increment their counter
          List<Integer> serversInTimeout = expectedAcksInfo.getTimeoutServers();
          for (Integer serverId : serversInTimeout)
          {
            ServerHandler expectedDSInTimeout = connectedDSs.get(serverId);
            ServerHandler expectedRSInTimeout = connectedRSs.get(serverId);
            if (expectedDSInTimeout != null)
            {
              if (safeRead)
              {
                expectedDSInTimeout.incrementAssuredSrSentUpdatesTimeout();
              } // else no SD update sent to a DS (meaningless)
            } else if (expectedRSInTimeout != null)
            {
              if (safeRead)
              {
                expectedRSInTimeout.incrementAssuredSrSentUpdatesTimeout();
              }
              else
              {
                expectedRSInTimeout.incrementAssuredSdSentUpdatesTimeout();
              }
            }
            // else server disappeared ? Let's forget about it.
          }
          // Mark the ack info object as completed to prevent potential
          // processAck() code parallel run
          expectedAcksInfo.completed();
        }
      }
    }
  }


  /**
   * Stop operations with a list of replication servers
969 * 970 * @param serversToDisconnect 971 * the replication servers addresses for which we want to stop 972 * operations 973 */ 974 public void stopReplicationServers(Collection<HostPort> serversToDisconnect) 975 { 976 for (ReplicationServerHandler rsHandler : connectedRSs.values()) 977 { 978 if (serversToDisconnect.contains( 979 HostPort.valueOf(rsHandler.getServerAddressURL()))) 980 { 981 stopServer(rsHandler, false); 982 } 983 } 984 } 985 986 /** 987 * Stop operations with all servers this domain is connected with (RS and DS). 988 * 989 * @param shutdown A boolean indicating if the stop is due to a 990 * shutdown condition. 991 */ 992 public void stopAllServers(boolean shutdown) 993 { 994 for (ReplicationServerHandler rsHandler : connectedRSs.values()) 995 { 996 stopServer(rsHandler, shutdown); 997 } 998 999 for (DataServerHandler dsHandler : connectedDSs.values()) 1000 { 1001 stopServer(dsHandler, shutdown); 1002 } 1003 } 1004 1005 /** 1006 * Checks whether it is already connected to a DS with same id. 1007 * 1008 * @param dsHandler 1009 * the DS we want to check 1010 * @return true if this DS is already connected to the current server 1011 */ 1012 public boolean isAlreadyConnectedToDS(DataServerHandler dsHandler) 1013 { 1014 if (connectedDSs.containsKey(dsHandler.getServerId())) 1015 { 1016 // looks like two connected LDAP servers have the same serverId 1017 logger.error(ERR_DUPLICATE_SERVER_ID, localReplicationServer.getMonitorInstanceName(), 1018 connectedDSs.get(dsHandler.getServerId()), dsHandler, dsHandler.getServerId()); 1019 return true; 1020 } 1021 return false; 1022 } 1023 1024 /** 1025 * Stop operations with a given server. 1026 * 1027 * @param sHandler the server for which we want to stop operations. 1028 * @param shutdown A boolean indicating if the stop is due to a 1029 * shutdown condition. 
   */
  public void stopServer(ServerHandler sHandler, boolean shutdown)
  {
    // TODO JNR merge with stopServer(MessageHandler)
    if (logger.isTraceEnabled())
    {
      debug("stopServer() on the server handler " + sHandler);
    }
    /*
     * We must prevent deadlock on replication server domain lock, when for
     * instance this code is called from dying ServerReader but also dying
     * ServerWriter at the same time, or from a thread that wants to shut down
     * the handler. So use a thread safe flag to know if the job must be done
     * or not (is already being processed or not).
     */
    if (!sHandler.engageShutdown())
    // Only do this once (prevent other thread to enter here again)
    {
      if (!shutdown)
      {
        try
        {
          // Acquire lock on domain (see more details in comment of start()
          // method of ServerHandler)
          lock();
        }
        catch (InterruptedException ex)
        {
          // We can't deal with this here, so re-interrupt thread so that it is
          // caught during subsequent IO.
          Thread.currentThread().interrupt();
          return;
        }
      }

      try
      {
        // Stop useless monitoring publisher if no more RS or DS in domain
        // (this handler is still counted, hence the comparison with 1)
        if ( (connectedDSs.size() + connectedRSs.size() )== 1)
        {
          if (logger.isTraceEnabled())
          {
            debug("remote server " + sHandler
                + " is the last RS/DS to be stopped:"
                + " stopping monitoring publisher");
          }
          stopMonitoringPublisher();
        }

        if (connectedRSs.containsKey(sHandler.getServerId()))
        {
          unregisterServerHandler(sHandler, shutdown, false);
        }
        else if (connectedDSs.containsKey(sHandler.getServerId()))
        {
          unregisterServerHandler(sHandler, shutdown, true);
        }
      }
      catch(Exception e)
      {
        logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
      }
      finally
      {
        // Release only if we acquired the lock above
        if (!shutdown)
        {
          release();
        }
      }
    }
  }

  /**
   * Unregisters the handler, shuts it down, and when not already shutting the
   * whole server down, advertises the topology change to the remaining peers.
   *
   * @param sHandler the handler to unregister
   * @param shutdown whether the stop is due to a shutdown condition
   * @param isDirectoryServer whether the handler is for a DS (true) or RS (false)
   */
  private void unregisterServerHandler(ServerHandler sHandler, boolean shutdown,
      boolean isDirectoryServer)
  {
    unregisterServerHandler(sHandler);
    sHandler.shutdown();

    resetGenerationIdIfPossible();
    if (!shutdown)
    {
      synchronized (pendingStatusMessagesLock)
      {
        if (isDirectoryServer)
        {
          // Update the remote replication servers with our list
          // of connected LDAP servers
          pendingStatusMessages.enqueueTopoInfoToAllRSs();
        }
        // Warn our DSs that a RS or DS has quit (does not use this
        // handler as already removed from list)
        pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(null);
      }
      statusAnalyzer.notifyPendingStatusMessage();
    }
  }

  /**
   * Unregister this handler from the list of handlers registered to this
   * domain.
   * @param sHandler the provided handler to unregister.
   */
  private void unregisterServerHandler(ServerHandler sHandler)
  {
    if (sHandler.isReplicationServer())
    {
      connectedRSs.remove(sHandler.getServerId());
    }
    else
    {
      connectedDSs.remove(sHandler.getServerId());
    }
  }

  /**
   * This method resets the generationId for this domain if there is no LDAP
   * server currently connected in the whole topology on this domain and if the
   * generationId has never been saved.
   * <ul>
   * <li>test emptiness of {@link #connectedDSs} list</li>
   * <li>traverse {@link #connectedRSs} list and test for each if DS are
   * connected</li>
   * </ul>
   * So it strongly relies on the {@link #connectedDSs} list
   */
  private void resetGenerationIdIfPossible()
  {
    if (logger.isTraceEnabled())
    {
      debug("mayResetGenerationId generationIdSavedStatus="
          + generationIdSavedStatus);
    }

    // If there is no more any LDAP server connected to this domain in the
    // topology and the generationId has never been saved, then we can reset
    // it and the next LDAP server to connect will become the new reference.
    boolean ldapServersConnectedInTheTopology = false;
    if (connectedDSs.isEmpty())
    {
      for (ReplicationServerHandler rsHandler : connectedRSs.values())
      {
        if (generationId != rsHandler.getGenerationId())
        {
          // RSs with a different generation id do not count: their DSs belong
          // to another data generation.
          if (logger.isTraceEnabled())
          {
            debug("mayResetGenerationId skip RS " + rsHandler
                + " that has different genId");
          }
        }
        else if (rsHandler.hasRemoteLDAPServers())
        {
          ldapServersConnectedInTheTopology = true;

          if (logger.isTraceEnabled())
          {
            debug("mayResetGenerationId RS " + rsHandler
                + " has ldap servers connected to it"
                + " - will not reset generationId");
          }
          break;
        }
      }
    }
    else
    {
      ldapServersConnectedInTheTopology = true;

      if (logger.isTraceEnabled())
      {
        debug("has ldap servers connected to it - will not reset generationId");
      }
    }

    if (!ldapServersConnectedInTheTopology
        && !generationIdSavedStatus
        && generationId != -1)
    {
      // -1 marks "no generation id set"
      changeGenerationId(-1);
    }
  }

  /**
   * Checks whether a remote RS is already connected to this hosting RS.
   *
   * @param rsHandler
   *          The handler for the remote RS.
   * @return flag specifying whether the remote RS is already connected.
   * @throws DirectoryException
   *           when a problem occurs.
   */
  public boolean isAlreadyConnectedToRS(ReplicationServerHandler rsHandler)
      throws DirectoryException
  {
    ReplicationServerHandler oldRsHandler =
        connectedRSs.get(rsHandler.getServerId());
    if (oldRsHandler == null)
    {
      return false;
    }

    if (oldRsHandler.getServerAddressURL().equals(
        rsHandler.getServerAddressURL()))
    {
      // this is the same server, this means that our ServerStart messages
      // have been sent at about the same time and 2 connections
      // have been established.
      // Silently drop this connection.
      return true;
    }

    // looks like two replication servers have the same serverId
    // log an error message and drop this connection.
    LocalizableMessage message = ERR_DUPLICATE_REPLICATION_SERVER_ID.get(
        localReplicationServer.getMonitorInstanceName(),
        oldRsHandler.getServerAddressURL(), rsHandler.getServerAddressURL(),
        rsHandler.getServerId());
    throw new DirectoryException(ResultCode.OTHER, message);
  }

  /**
   * Creates and returns a cursor across this replication domain.
   * <p>
   * Client code must call {@link DBCursor#next()} to advance the cursor to the
   * next available record.
   * <p>
   * When the cursor is not used anymore, client code MUST call the
   * {@link DBCursor#close()} method to free the resources and locks used by the
   * cursor.
   *
   * @param startAfterServerState
   *          Starting point for the replicaDB cursors. If null, start from the
   *          oldest CSN
   * @return a non null {@link DBCursor} going from oldest to newest CSN
   * @throws ChangelogException
   *           If a database problem happened
   * @see ReplicationDomainDB#getCursorFrom(DN, ServerState, CursorOptions)
   */
  public DBCursor<UpdateMsg> getCursorFrom(ServerState startAfterServerState)
      throws ChangelogException
  {
    CursorOptions options = new CursorOptions(GREATER_THAN_OR_EQUAL_TO_KEY, AFTER_MATCHING_KEY);
    return domainDB.getCursorFrom(baseDN, startAfterServerState, options);
  }

  /**
   * Get the baseDN.
   *
   * @return Returns the baseDN.
   */
  public DN getBaseDN()
  {
    return baseDN;
  }

  /**
   * Retrieves the destination handlers for a routable message.
   *
   * @param msg The message to route.
   * @param senderHandler The handler of the server that published this message.
   * @return The list of destination handlers.
   */
  private List<ServerHandler> getDestinationServers(RoutableMsg msg,
      ServerHandler senderHandler)
  {
    List<ServerHandler> servers = new ArrayList<>();

    if (msg.getDestination() == RoutableMsg.THE_CLOSEST_SERVER)
    {
      // TODO Import from the "closest server" to be implemented
    } else if (msg.getDestination() == RoutableMsg.ALL_SERVERS)
    {
      if (!senderHandler.isReplicationServer())
      {
        // Send to all replication servers with a least one remote
        // server connected
        for (ReplicationServerHandler rsh : connectedRSs.values())
        {
          if (rsh.hasRemoteLDAPServers())
          {
            servers.add(rsh);
          }
        }
      }

      // Sends to all connected LDAP servers
      for (DataServerHandler destinationHandler : connectedDSs.values())
      {
        // Don't loop on the sender
        if (destinationHandler == senderHandler)
        {
          continue;
        }
        servers.add(destinationHandler);
      }
    } else
    {
      // Destination is one server
      DataServerHandler destinationHandler =
          connectedDSs.get(msg.getDestination());
      if (destinationHandler != null)
      {
        servers.add(destinationHandler);
      } else
      {
        // the targeted server is NOT connected
        // Let's search for the replication server that MAY
        // have the targeted server connected.
        if (senderHandler.isDataServer())
        {
          for (ReplicationServerHandler rsHandler : connectedRSs.values())
          {
            // Send to all replication servers with a least one remote
            // server connected
            if (rsHandler.isRemoteLDAPServer(msg.getDestination()))
            {
              servers.add(rsHandler);
            }
          }
        }
      }
    }
    return servers;
  }

  /**
   * Processes a message coming from one server in the topology and potentially
   * forwards it to one or all other servers.
   *
   * @param msg
   *          The message received and to be processed.
   * @param sender
   *          The server handler of the server that sent the message.
   */
  void process(RoutableMsg msg, ServerHandler sender)
  {
    if (msg.getDestination() == localReplicationServer.getServerId())
    {
      // Handle routable messages targeted at this RS.
      if (msg instanceof ErrorMsg)
      {
        ErrorMsg errorMsg = (ErrorMsg) msg;
        logger.error(ERR_ERROR_MSG_RECEIVED, errorMsg.getDetails());
      }
      else
      {
        // Any other routable message type is not handled by a RS
        replyWithUnroutableMsgType(sender, msg);
      }
    }
    else
    {
      // Forward message not destined for this RS.
      List<ServerHandler> servers = getDestinationServers(msg, sender);
      if (!servers.isEmpty())
      {
        forwardMsgToAllServers(msg, servers, sender);
      }
      else
      {
        replyWithUnreachablePeerMsg(sender, msg);
      }
    }
  }

  /**
   * Responds to a monitor request message.
   *
   * @param msg
   *          The monitor request message.
   * @param sender
   *          The DS/RS which sent the monitor request.
   */
  void processMonitorRequestMsg(MonitorRequestMsg msg, ServerHandler sender)
  {
    enqueueMonitorMsg(msg, sender);
  }

  /**
   * Responds to a monitor message.
   *
   * @param msg
   *          The monitor message
   * @param sender
   *          The DS/RS which sent the monitor.
   */
  void processMonitorMsg(MonitorMsg msg, ServerHandler sender)
  {
    domainMonitor.receiveMonitorDataResponse(msg, sender.getServerId());
  }

  /**
   * Sends an {@link ErrorMsg} back to the emitter telling it this RS cannot
   * route messages of the received type, and logs the event.
   */
  private void replyWithUnroutableMsgType(ServerHandler msgEmitter,
      RoutableMsg msg)
  {
    String msgClassname = msg.getClass().getCanonicalName();
    logger.info(NOTE_ERR_ROUTING_TO_SERVER, msgClassname);

    LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
    mb.append(NOTE_ERR_ROUTING_TO_SERVER.get(msgClassname));
    mb.append("serverID:").append(msg.getDestination());
    ErrorMsg errMsg = new ErrorMsg(msg.getSenderID(), mb.toMessage());
    try
    {
      msgEmitter.send(errMsg);
    }
    catch (IOException ignored)
    {
      // an error happened on the sender session trying to recover
      // from an error on the receiver session.
      // Not much more we can do at this point.
    }
  }

  /**
   * Sends an {@link ErrorMsg} back to the emitter telling it no peer in the
   * domain could be reached for the destination of the given message; closes
   * the emitter connection if even that error cannot be sent.
   */
  private void replyWithUnreachablePeerMsg(ServerHandler msgEmitter,
      RoutableMsg msg)
  {
    LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
    mb.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(baseDN, msg.getDestination()));
    mb.append(" In Replication Server=").append(
        this.localReplicationServer.getMonitorInstanceName());
    mb.append(" unroutable message =").append(msg.getClass().getSimpleName());
    mb.append(" Details:routing table is empty");
    final LocalizableMessage message = mb.toMessage();
    logger.error(message);

    ErrorMsg errMsg = new ErrorMsg(this.localReplicationServer.getServerId(),
        msg.getSenderID(), message);
    try
    {
      msgEmitter.send(errMsg);
    }
    catch (IOException ignored)
    {
      // TODO Handle error properly (sender timeout in addition)
      /*
       * An error happened trying to send an error msg to this server.
       * Log an error and close the connection to this server.
       */
      logger.error(ERR_CHANGELOG_ERROR_SENDING_ERROR, this, ignored);
      stopServer(msgEmitter, false);
    }
  }

  /**
   * Forwards the message to every handler in the provided list; on send
   * failure, reports an {@link ErrorMsg} back to the message originator,
   * closing both sessions if even that report fails.
   */
  private void forwardMsgToAllServers(RoutableMsg msg,
      List<ServerHandler> servers, ServerHandler sender)
  {
    for (ServerHandler targetHandler : servers)
    {
      try
      {
        targetHandler.send(msg);
      } catch (IOException ioe)
      {
        /*
         * An error happened trying to send a routable message to its
         * destination server.
         * Send back an error to the originator of the message.
         */
        LocalizableMessageBuilder mb = new LocalizableMessageBuilder();
        mb.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(baseDN, msg.getDestination()));
        mb.append(" unroutable message =").append(msg.getClass().getSimpleName());
        mb.append(" Details: ").append(ioe.getLocalizedMessage());
        final LocalizableMessage message = mb.toMessage();
        logger.error(message);

        ErrorMsg errMsg = new ErrorMsg(msg.getSenderID(), message);
        try
        {
          sender.send(errMsg);
        } catch (IOException ioe1)
        {
          // an error happened on the sender session trying to recover
          // from an error on the receiver session.
          // We don't have much solution left beside closing the sessions.
          stopServer(sender, false);
          stopServer(targetHandler, false);
        }
        // TODO Handle error properly (sender timeout in addition)
      }
    }
  }

  /**
   * Creates a new monitor message including monitoring information for the
   * whole topology.
   *
   * @param sender
   *          The sender of this message.
   * @param destination
   *          The destination of this message.
   * @return The newly created and filled MonitorMsg.
1522 * @throws InterruptedException 1523 * if this thread is interrupted while waiting for a response 1524 */ 1525 public MonitorMsg createGlobalTopologyMonitorMsg(int sender, int destination) 1526 throws InterruptedException 1527 { 1528 return createGlobalTopologyMonitorMsg(sender, destination, 1529 domainMonitor.recomputeMonitorData()); 1530 } 1531 1532 private MonitorMsg createGlobalTopologyMonitorMsg(int sender, 1533 int destination, ReplicationDomainMonitorData monitorData) 1534 { 1535 final MonitorMsg returnMsg = new MonitorMsg(sender, destination); 1536 returnMsg.setReplServerDbState(getLatestServerState()); 1537 1538 // Add the server state for each DS and RS currently in the topology. 1539 for (int replicaId : toIterable(monitorData.ldapIterator())) 1540 { 1541 returnMsg.setServerState(replicaId, 1542 monitorData.getLDAPServerState(replicaId), 1543 monitorData.getApproxFirstMissingDate(replicaId), true); 1544 } 1545 1546 for (int replicaId : toIterable(monitorData.rsIterator())) 1547 { 1548 returnMsg.setServerState(replicaId, 1549 monitorData.getRSStates(replicaId), 1550 monitorData.getRSApproxFirstMissingDate(replicaId), false); 1551 } 1552 1553 return returnMsg; 1554 } 1555 1556 1557 1558 /** 1559 * Creates a new monitor message including monitoring information for the 1560 * topology directly connected to this RS. This includes information for: - 1561 * local RS - all direct DSs - all direct RSs 1562 * 1563 * @param sender 1564 * The sender of this message. 1565 * @param destination 1566 * The destination of this message. 1567 * @return The newly created and filled MonitorMsg. Null if the current thread 1568 * was interrupted while attempting to get the domain lock. 
1569 */ 1570 private MonitorMsg createLocalTopologyMonitorMsg(int sender, int destination) 1571 { 1572 final MonitorMsg monitorMsg = new MonitorMsg(sender, destination); 1573 monitorMsg.setReplServerDbState(getLatestServerState()); 1574 1575 // Add the server state for each connected DS and RS. 1576 for (DataServerHandler dsHandler : this.connectedDSs.values()) 1577 { 1578 monitorMsg.setServerState(dsHandler.getServerId(), 1579 dsHandler.getServerState(), dsHandler.getApproxFirstMissingDate(), 1580 true); 1581 } 1582 1583 for (ReplicationServerHandler rsHandler : this.connectedRSs.values()) 1584 { 1585 monitorMsg.setServerState(rsHandler.getServerId(), 1586 rsHandler.getServerState(), rsHandler.getApproxFirstMissingDate(), 1587 false); 1588 } 1589 return monitorMsg; 1590 } 1591 1592 /** 1593 * Shutdown this ReplicationServerDomain. 1594 */ 1595 public void shutdown() 1596 { 1597 DirectoryServer.deregisterMonitorProvider(this); 1598 1599 // Terminate the assured timer 1600 assuredTimeoutTimer.cancel(); 1601 1602 stopAllServers(true); 1603 statusAnalyzer.shutdown(); 1604 } 1605 1606 /** 1607 * Returns the latest most current ServerState describing the newest CSNs for 1608 * each server in this domain. 1609 * 1610 * @return The ServerState describing the newest CSNs for each server in in 1611 * this domain. 1612 */ 1613 public ServerState getLatestServerState() 1614 { 1615 return domainDB.getDomainNewestCSNs(baseDN); 1616 } 1617 1618 /** {@inheritDoc} */ 1619 @Override 1620 public String toString() 1621 { 1622 return "ReplicationServerDomain " + baseDN; 1623 } 1624 1625 1626 1627 /** 1628 * Creates a TopologyMsg filled with information to be sent to a remote RS. 1629 * We send remote RS the info of every DS that are directly connected to us 1630 * plus our own info as RS. 
1631 * @return A suitable TopologyMsg PDU to be sent to a peer RS 1632 */ 1633 public TopologyMsg createTopologyMsgForRS() 1634 { 1635 List<DSInfo> dsInfos = new ArrayList<>(); 1636 for (DataServerHandler dsHandler : connectedDSs.values()) 1637 { 1638 dsInfos.add(dsHandler.toDSInfo()); 1639 } 1640 1641 // Create info for the local RS 1642 List<RSInfo> rsInfos = newArrayList(toRSInfo(localReplicationServer, generationId)); 1643 1644 return new TopologyMsg(dsInfos, rsInfos); 1645 } 1646 1647 /** 1648 * Creates a TopologyMsg filled with information to be sent to a DS. 1649 * We send remote DS the info of every known DS and RS in the topology (our 1650 * directly connected DSs plus the DSs connected to other RSs) except himself. 1651 * Also put info related to local RS. 1652 * 1653 * @param destDsId The id of the DS the TopologyMsg PDU is to be sent to and 1654 * that we must not include in the DS list. 1655 * @return A suitable TopologyMsg PDU to be sent to a peer DS 1656 */ 1657 public TopologyMsg createTopologyMsgForDS(int destDsId) 1658 { 1659 // Go through every DSs (except recipient of msg) 1660 List<DSInfo> dsInfos = new ArrayList<>(); 1661 for (DataServerHandler dsHandler : connectedDSs.values()) 1662 { 1663 if (dsHandler.getServerId() == destDsId) 1664 { 1665 continue; 1666 } 1667 dsInfos.add(dsHandler.toDSInfo()); 1668 } 1669 1670 1671 List<RSInfo> rsInfos = new ArrayList<>(); 1672 // Add our own info (local RS) 1673 rsInfos.add(toRSInfo(localReplicationServer, generationId)); 1674 1675 // Go through every peer RSs (and get their connected DSs), also add info 1676 // for RSs 1677 for (ReplicationServerHandler rsHandler : connectedRSs.values()) 1678 { 1679 rsInfos.add(rsHandler.toRSInfo()); 1680 1681 rsHandler.addDSInfos(dsInfos); 1682 } 1683 1684 return new TopologyMsg(dsInfos, rsInfos); 1685 } 1686 1687 private RSInfo toRSInfo(ReplicationServer rs, long generationId) 1688 { 1689 return new RSInfo(rs.getServerId(), rs.getServerURL(), generationId, 1690 
rs.getGroupId(), rs.getWeight()); 1691 } 1692 1693 /** 1694 * Get the generationId associated to this domain. 1695 * 1696 * @return The generationId 1697 */ 1698 public long getGenerationId() 1699 { 1700 return generationId; 1701 } 1702 1703 /** 1704 * Initialize the value of the generationID for this ReplicationServerDomain. 1705 * This method is intended to be used for initialization at startup and 1706 * simply stores the new value without any additional processing. 1707 * For example it does not clear the change-log DBs 1708 * 1709 * @param generationId The new value of generationId. 1710 */ 1711 public void initGenerationID(long generationId) 1712 { 1713 synchronized (generationIDLock) 1714 { 1715 this.generationId = generationId; 1716 this.generationIdSavedStatus = true; 1717 } 1718 } 1719 1720 /** 1721 * Sets the provided value as the new in memory generationId. 1722 * Also clear the changelog databases. 1723 * 1724 * @param generationId The new value of generationId. 1725 * @return The old generation id 1726 */ 1727 public long changeGenerationId(long generationId) 1728 { 1729 synchronized (generationIDLock) 1730 { 1731 long oldGenerationId = this.generationId; 1732 1733 if (this.generationId != generationId) 1734 { 1735 clearDbs(); 1736 1737 this.generationId = generationId; 1738 this.generationIdSavedStatus = false; 1739 } 1740 return oldGenerationId; 1741 } 1742 } 1743 1744 /** 1745 * Resets the generationID. 1746 * 1747 * @param senderHandler The handler associated to the server 1748 * that requested to reset the generationId. 1749 * @param genIdMsg The reset generation ID msg received. 
   */
  public void resetGenerationId(ServerHandler senderHandler,
      ResetGenerationIdMsg genIdMsg)
  {
    if (logger.isTraceEnabled())
    {
      debug("Receiving ResetGenerationIdMsg from "
          + senderHandler.getServerId() + ":\n" + genIdMsg);
    }

    try
    {
      // Acquire lock on domain (see more details in comment of start() method
      // of ServerHandler)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We can't deal with this here, so re-interrupt thread so that it is
      // caught during subsequent IO.
      Thread.currentThread().interrupt();
      return;
    }

    try
    {
      final long newGenId = genIdMsg.getGenerationId();
      if (newGenId != this.generationId)
      {
        changeGenerationId(newGenId);
      }
      else
      {
        // Order to take a gen id we already have, just ignore
        if (logger.isTraceEnabled())
        {
          debug("Reset generation id requested but generationId was already "
              + this.generationId + ":\n" + genIdMsg);
        }
      }

      // If we are the first replication server warned,
      // then forwards the reset message to the remote replication servers
      for (ServerHandler rsHandler : connectedRSs.values())
      {
        try
        {
          // After we'll have sent the message , the remote RS will adopt
          // the new genId
          rsHandler.setGenerationId(newGenId);
          if (senderHandler.isDataServer())
          {
            rsHandler.send(genIdMsg);
          }
        } catch (IOException e)
        {
          logger.error(ERR_EXCEPTION_FORWARDING_RESET_GEN_ID, baseDN, e.getMessage());
        }
      }

      // Change status of the connected DSs according to the requested new
      // reference generation id
      for (DataServerHandler dsHandler : connectedDSs.values())
      {
        try
        {
          dsHandler.changeStatusForResetGenId(newGenId);
        } catch (IOException e)
        {
          logger.error(ERR_EXCEPTION_CHANGING_STATUS_AFTER_RESET_GEN_ID, baseDN,
              dsHandler.getServerId(), e.getMessage());
        }
      }

      // Update every peers (RS/DS) with potential topology changes (status
      // change). Rather than doing that each time a DS has a status change
      // (consecutive to reset gen id message), we prefer advertising once for
      // all after changes (less packet sent), here at the end of the reset msg
      // treatment.
      sendTopoInfoToAll();

      logger.info(NOTE_RESET_GENERATION_ID, baseDN, newGenId);
    }
    catch(Exception e)
    {
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }
  }

  /**
   * Process message of a remote server changing his status.
   * @param senderHandler The handler associated to the server
   *        that changed his status.
   * @param csMsg The message containing the new status
   */
  public void processNewStatus(DataServerHandler senderHandler,
      ChangeStatusMsg csMsg)
  {
    if (logger.isTraceEnabled())
    {
      debug("receiving ChangeStatusMsg from " + senderHandler.getServerId()
          + ":\n" + csMsg);
    }

    try
    {
      // Acquire lock on domain (see more details in comment of start() method
      // of ServerHandler)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We can't deal with this here, so re-interrupt thread so that it is
      // caught during subsequent IO.
      Thread.currentThread().interrupt();
      return;
    }

    try
    {
      ServerStatus newStatus = senderHandler.processNewStatus(csMsg);
      if (newStatus == ServerStatus.INVALID_STATUS)
      {
        // Already logged an error in processNewStatus()
        // just return not to forward a bad status to topology
        return;
      }

      enqueueTopoInfoToAllExcept(senderHandler);

      logger.info(NOTE_DIRECTORY_SERVER_CHANGED_STATUS,
          senderHandler.getServerId(), baseDN, newStatus);
    }
    catch(Exception e)
    {
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }
  }

  /**
   * Change the status of a directory server according to the event generated
   * from the status analyzer.
   * @param dsHandler The handler of the directory server to update
   * @param event The event to be used for new status computation
   * @return True if we have been interrupted (must stop), false otherwise
   */
  private boolean changeStatus(DataServerHandler dsHandler,
      StatusMachineEvent event)
  {
    try
    {
      // Acquire lock on domain (see ServerHandler#start() for more details)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We have been interrupted for dying, from stopStatusAnalyzer
      // to prevent deadlock in this situation:
      // RS is being shutdown, and stopServer will call stopStatusAnalyzer.
      // Domain lock is taken by shutdown thread while status analyzer thread
      // is willing to change the status of a server at the same time so is
      // waiting for the domain lock at the same time. As shutdown thread is
      // waiting for analyzer thread death, a deadlock occurs. So we force
      // interruption of the status analyzer thread death after 2 seconds if
      // it has not finished (see StatusAnalyzer.waitForShutdown). This allows
      // to have the analyzer thread taking the domain lock only when the
      // status of a DS has to be changed. See more comments in run method of
      // StatusAnalyzer.
      if (logger.isTraceEnabled())
      {
        logger.trace("Status analyzer for domain " + baseDN
            + " has been interrupted when"
            + " trying to acquire domain lock for changing the status of DS "
            + dsHandler.getServerId());
      }
      return true;
    }

    try
    {
      ServerStatus newStatus = ServerStatus.INVALID_STATUS;
      ServerStatus oldStatus = dsHandler.getStatus();
      try
      {
        newStatus = dsHandler.changeStatus(event);
      }
      catch (IOException e)
      {
        logger.error(ERR_EXCEPTION_CHANGING_STATUS_FROM_STATUS_ANALYZER,
            baseDN, dsHandler.getServerId(), e.getMessage());
      }

      if (newStatus == ServerStatus.INVALID_STATUS || newStatus == oldStatus)
      {
        // Change was impossible or already occurred (see StatusAnalyzer
        // comments)
        return false;
      }

      enqueueTopoInfoToAllExcept(dsHandler);
    }
    catch (Exception e)
    {
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }

    return false;
  }

  /**
   * Update every peers (RS/DS) with topology changes.
   */
  public void sendTopoInfoToAll()
  {
    enqueueTopoInfoToAllExcept(null);
  }

  /**
   * Update every peers (RS/DS) with topology changes but one DS.
1981 * 1982 * @param dsHandler 1983 * if not null, the topology message will not be sent to this DS 1984 */ 1985 private void enqueueTopoInfoToAllExcept(DataServerHandler dsHandler) 1986 { 1987 synchronized (pendingStatusMessagesLock) 1988 { 1989 pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(dsHandler); 1990 pendingStatusMessages.enqueueTopoInfoToAllRSs(); 1991 } 1992 statusAnalyzer.notifyPendingStatusMessage(); 1993 } 1994 1995 /** 1996 * Clears the Db associated with that domain. 1997 */ 1998 private void clearDbs() 1999 { 2000 try 2001 { 2002 domainDB.removeDomain(baseDN); 2003 } 2004 catch (ChangelogException e) 2005 { 2006 logger.error(ERR_ERROR_CLEARING_DB, baseDN, e.getMessage(), e); 2007 } 2008 } 2009 2010 /** 2011 * Returns whether the provided server is in degraded 2012 * state due to the fact that the peer server has an invalid 2013 * generationId for this domain. 2014 * 2015 * @param serverId The serverId for which we want to know the 2016 * the state. 2017 * @return Whether it is degraded or not. 2018 */ 2019 public boolean isDegradedDueToGenerationId(int serverId) 2020 { 2021 if (logger.isTraceEnabled()) 2022 { 2023 debug("isDegraded serverId=" + serverId + " given local generation Id=" 2024 + this.generationId); 2025 } 2026 2027 ServerHandler sHandler = connectedRSs.get(serverId); 2028 if (sHandler == null) 2029 { 2030 sHandler = connectedDSs.get(serverId); 2031 if (sHandler == null) 2032 { 2033 return false; 2034 } 2035 } 2036 2037 if (logger.isTraceEnabled()) 2038 { 2039 debug("Compute degradation of serverId=" + serverId 2040 + " LS server generation Id=" + sHandler.getGenerationId()); 2041 } 2042 return sHandler.getGenerationId() != this.generationId; 2043 } 2044 2045 /** 2046 * Process topology information received from a peer RS. 2047 * @param topoMsg The just received topo message from remote RS 2048 * @param rsHandler The handler that received the message. 
   * @param allowResetGenId True for allowing to reset the generation id (
   *        when called after initial handshake)
   * @throws IOException If an error occurred.
   * @throws DirectoryException If an error occurred.
   */
  public void receiveTopoInfoFromRS(TopologyMsg topoMsg,
      ReplicationServerHandler rsHandler, boolean allowResetGenId)
      throws IOException, DirectoryException
  {
    if (logger.isTraceEnabled())
    {
      debug("receiving TopologyMsg from serverId=" + rsHandler.getServerId()
          + ":\n" + topoMsg);
    }

    try
    {
      // Acquire lock on domain (see more details in comment of start() method
      // of ServerHandler)
      lock();
    }
    catch (InterruptedException ex)
    {
      // We can't deal with this here, so re-interrupt thread so that it is
      // caught during subsequent IO.
      Thread.currentThread().interrupt();
      return;
    }

    try
    {
      // Store DS connected to remote RS & update information about the peer RS
      rsHandler.processTopoInfoFromRS(topoMsg);

      // Handle generation id
      if (allowResetGenId)
      {
        resetGenerationIdIfPossible();
        // Adopt the peer's generation id if we do not have one yet.
        setGenerationIdIfUnset(rsHandler.getGenerationId());
      }

      if (isDifferentGenerationId(rsHandler.getGenerationId()))
      {
        // Peer RS has a different generation id: warn locally and notify the
        // remote RS with an ErrorMsg so it knows it is out of sync.
        LocalizableMessage message = WARN_BAD_GENERATION_ID_FROM_RS.get(rsHandler.getServerId(),
            rsHandler.session.getReadableRemoteAddress(), rsHandler.getGenerationId(),
            baseDN, getLocalRSServerId(), generationId);
        logger.warn(message);

        ErrorMsg errorMsg = new ErrorMsg(getLocalRSServerId(),
            rsHandler.getServerId(), message);
        rsHandler.send(errorMsg);
      }

      /*
       * Sends the currently known topology information to every connected
       * DS we have.
       */
      synchronized (pendingStatusMessagesLock)
      {
        pendingStatusMessages.enqueueTopoInfoToAllDSsExcept(null);
      }
      statusAnalyzer.notifyPendingStatusMessage();
    }
    catch(Exception e)
    {
      logger.error(LocalizableMessage.raw(stackTraceToSingleLineString(e)));
    }
    finally
    {
      release();
    }
  }

  /**
   * Adopts the provided generation id when this domain has none yet
   * (a negative value means "unset").
   *
   * @param generationId the candidate generation id received from a peer
   */
  private void setGenerationIdIfUnset(long generationId)
  {
    if (this.generationId < 0)
    {
      this.generationId = generationId;
    }
  }

  /**
   * Returns the latest monitor data available for this replication server
   * domain.
   *
   * @return The latest monitor data available for this replication server
   *         domain, which is never {@code null}.
   */
  ReplicationDomainMonitorData getDomainMonitorData()
  {
    return domainMonitor.getMonitorData();
  }

  /**
   * Get the map of connected DSs.
   * @return The map of connected DSs (unmodifiable view)
   */
  public Map<Integer, DataServerHandler> getConnectedDSs()
  {
    return Collections.unmodifiableMap(connectedDSs);
  }

  /**
   * Get the map of connected RSs.
   * @return The map of connected RSs (unmodifiable view)
   */
  public Map<Integer, ReplicationServerHandler> getConnectedRSs()
  {
    return Collections.unmodifiableMap(connectedRSs);
  }

  /**
   * A synchronization mechanism is created to insure exclusive access to the
   * domain. The goal is to have a consistent view of the topology by locking
   * the structures holding the topology view of the domain:
   * {@link #connectedDSs} and {@link #connectedRSs}. When a connection is
   * established with a peer DS or RS, the lock should be taken before updating
   * these structures, then released.
The same mechanism should be used when 2168 * updating any data related to the view of the topology: for instance if the 2169 * status of a DS is changed, the lock should be taken before updating the 2170 * matching server handler and sending the topology messages to peers and 2171 * released after.... This allows every member of the topology to have a 2172 * consistent view of the topology and to be sure it will not miss some 2173 * information. 2174 * <p> 2175 * So the locking system must be called (not exhaustive list): 2176 * <ul> 2177 * <li>when connection established with a DS or RS</li> 2178 * <li>when connection ended with a DS or RS</li> 2179 * <li>when receiving a TopologyMsg and updating structures</li> 2180 * <li>when creating and sending a TopologyMsg</li> 2181 * <li>when a DS status is changing (ChangeStatusMsg received or sent)...</li> 2182 * </ul> 2183 */ 2184 private final ReentrantLock lock = new ReentrantLock(); 2185 2186 /** 2187 * This lock is used to protect the generationId variable. 2188 */ 2189 private final Object generationIDLock = new Object(); 2190 2191 /** 2192 * Tests if the current thread has the lock on this domain. 2193 * @return True if the current thread has the lock. 2194 */ 2195 public boolean hasLock() 2196 { 2197 return lock.getHoldCount() > 0; 2198 } 2199 2200 /** 2201 * Takes the lock on this domain (blocking until lock can be acquired) or 2202 * calling thread is interrupted. 2203 * @throws java.lang.InterruptedException If interrupted. 2204 */ 2205 public void lock() throws InterruptedException 2206 { 2207 lock.lockInterruptibly(); 2208 } 2209 2210 /** 2211 * Releases the lock on this domain. 2212 */ 2213 public void release() 2214 { 2215 lock.unlock(); 2216 } 2217 2218 /** 2219 * Tries to acquire the lock on the domain within a given amount of time. 2220 * @param timeout The amount of milliseconds to wait for acquiring the lock. 2221 * @return True if the lock was acquired, false if timeout occurred. 
2222 * @throws java.lang.InterruptedException When call was interrupted. 2223 */ 2224 public boolean tryLock(long timeout) throws InterruptedException 2225 { 2226 return lock.tryLock(timeout, TimeUnit.MILLISECONDS); 2227 } 2228 2229 /** 2230 * Starts the monitoring publisher for the domain if not already started. 2231 */ 2232 private void startMonitoringPublisher() 2233 { 2234 long period = localReplicationServer.getMonitoringPublisherPeriod(); 2235 if (period > 0) // 0 means no monitoring publisher 2236 { 2237 final MonitoringPublisher thread = new MonitoringPublisher(this, period); 2238 if (monitoringPublisher.compareAndSet(null, thread)) 2239 { 2240 thread.start(); 2241 } 2242 } 2243 } 2244 2245 /** 2246 * Stops the monitoring publisher for the domain. 2247 */ 2248 private void stopMonitoringPublisher() 2249 { 2250 final MonitoringPublisher thread = monitoringPublisher.get(); 2251 if (thread != null && monitoringPublisher.compareAndSet(thread, null)) 2252 { 2253 thread.shutdown(); 2254 thread.waitForShutdown(); 2255 } 2256 } 2257 2258 /** {@inheritDoc} */ 2259 @Override 2260 public void initializeMonitorProvider(MonitorProviderCfg configuraiton) 2261 { 2262 // Nothing to do for now 2263 } 2264 2265 /** {@inheritDoc} */ 2266 @Override 2267 public String getMonitorInstanceName() 2268 { 2269 return "Replication server RS(" + localReplicationServer.getServerId() 2270 + ") " + localReplicationServer.getServerURL() + ",cn=" 2271 + baseDN.toString().replace(',', '_').replace('=', '_') 2272 + ",cn=Replication"; 2273 } 2274 2275 @Override 2276 public MonitorData getMonitorData() 2277 { 2278 int serverId = localReplicationServer.getServerId(); 2279 2280 final MonitorData attributes = new MonitorData(5); 2281 attributes.add("replication-server-id", serverId); 2282 attributes.add("replication-server-port", localReplicationServer.getReplicationPort()); 2283 attributes.add("domain-name", baseDN); 2284 attributes.add("generation-id", baseDN + " " + generationId); 2285 
attributes.add("missing-changes", getDomainMonitorData().getMissingChangesRS(serverId)); 2286 return attributes; 2287 } 2288 2289 /** 2290 * Returns the oldest known state for the domain, made of the oldest CSN 2291 * stored for each serverId. 2292 * <p> 2293 * Note: Because the replication changelogDB trimming always keep one change 2294 * whatever its date, the CSN contained in the returned state can be very old. 2295 * 2296 * @return the start state of the domain. 2297 */ 2298 public ServerState getOldestState() 2299 { 2300 return domainDB.getDomainOldestCSNs(baseDN); 2301 } 2302 2303 private void sendTopologyMsg(String type, ServerHandler handler, TopologyMsg msg) 2304 { 2305 for (int i = 1; i <= 2; i++) 2306 { 2307 if (!handler.shuttingDown() 2308 && handler.getStatus() != ServerStatus.NOT_CONNECTED_STATUS) 2309 { 2310 try 2311 { 2312 handler.sendTopoInfo(msg); 2313 break; 2314 } 2315 catch (IOException e) 2316 { 2317 if (i == 2) 2318 { 2319 logger.error(ERR_EXCEPTION_SENDING_TOPO_INFO, 2320 baseDN, type, handler.getServerId(), e.getMessage()); 2321 } 2322 } 2323 } 2324 sleep(100); 2325 } 2326 } 2327 2328 2329 2330 /** 2331 * Processes a ChangeTimeHeartbeatMsg received, by storing the CSN (timestamp) 2332 * value received, and forwarding the message to the other RSes. 2333 * @param senderHandler The handler for the server that sent the heartbeat. 2334 * @param msg The message to process. 
2335 * @throws DirectoryException 2336 * if a problem occurs 2337 */ 2338 void processChangeTimeHeartbeatMsg(ServerHandler senderHandler, 2339 ChangeTimeHeartbeatMsg msg) throws DirectoryException 2340 { 2341 try 2342 { 2343 domainDB.replicaHeartbeat(baseDN, msg.getCSN()); 2344 } 2345 catch (ChangelogException e) 2346 { 2347 throw new DirectoryException(ResultCode.OPERATIONS_ERROR, e 2348 .getMessageObject(), e); 2349 } 2350 2351 if (senderHandler.isDataServer()) 2352 { 2353 /* 2354 * If we are the first replication server warned, then forward the message 2355 * to the remote replication servers. 2356 */ 2357 synchronized (pendingStatusMessagesLock) 2358 { 2359 pendingStatusMessages.enqueueChangeTimeHeartbeatMsg(msg); 2360 } 2361 statusAnalyzer.notifyPendingStatusMessage(); 2362 } 2363 } 2364 2365 /** 2366 * Return the monitor instance name of the ReplicationServer that created the 2367 * current instance. 2368 * 2369 * @return the monitor instance name of the ReplicationServer that created the 2370 * current instance. 2371 */ 2372 String getLocalRSMonitorInstanceName() 2373 { 2374 return this.localReplicationServer.getMonitorInstanceName(); 2375 } 2376 2377 /** 2378 * Return the serverId of the ReplicationServer that created the current 2379 * instance. 2380 * 2381 * @return the serverId of the ReplicationServer that created the current 2382 * instance. 2383 */ 2384 int getLocalRSServerId() 2385 { 2386 return this.localReplicationServer.getServerId(); 2387 } 2388 2389 /** 2390 * Update the monitoring publisher with the new period value. 2391 * 2392 * @param period 2393 * The new period value. 
   */
  void updateMonitoringPeriod(long period)
  {
    if (period == 0)
    {
      // Requested to stop monitoring publishers
      stopMonitoringPublisher();
      return;
    }

    final MonitoringPublisher mpThread = monitoringPublisher.get();
    if (mpThread != null) // it is running
    {
      mpThread.setPeriod(period);
    }
    else if (!connectedDSs.isEmpty() || !connectedRSs.isEmpty())
    {
      // Requested to start monitoring publishers with provided period value
      startMonitoringPublisher();
    }
  }

  /**
   * Registers a DS handler into this domain and notifies the domain about the
   * new DS.
   *
   * @param dsHandler
   *          The Directory Server Handler to register
   */
  public void register(DataServerHandler dsHandler)
  {
    startMonitoringPublisher();

    // connected with new DS: store handler.
    connectedDSs.put(dsHandler.getServerId(), dsHandler);

    // Tell peer RSs and DSs a new DS just connected to us
    // No need to re-send TopologyMsg to this just new DS
    enqueueTopoInfoToAllExcept(dsHandler);
  }

  /**
   * Registers the RS handler into this domain and notifies the domain.
   *
   * @param rsHandler
   *          The Replication Server Handler to register
   */
  public void register(ReplicationServerHandler rsHandler)
  {
    startMonitoringPublisher();

    // connected with new RS (either outgoing or incoming
    // connection): store handler.
    connectedRSs.put(rsHandler.getServerId(), rsHandler);
  }

  /**
   * Traces the provided message prefixed with this domain's identifying
   * information (local server id, baseDN and replication port).
   *
   * @param message the message to trace
   */
  private void debug(String message)
  {
    logger.trace("In ReplicationServerDomain serverId="
        + localReplicationServer.getServerId() + " for baseDN=" + baseDN
        + " and port=" + localReplicationServer.getReplicationPort()
        + ": " + message);
  }

  /**
   * Go through each connected DS, get the number of pending changes we have for
   * it and change status accordingly if threshold value is crossed/uncrossed.
   */
  void checkDSDegradedStatus()
  {
    final int degradedStatusThreshold = localReplicationServer
        .getDegradedStatusThreshold();
    // Threshold value = 0 means no status analyzer (no degrading system)
    // we should not have that as the status analyzer thread should not be
    // created if this is the case, but for sanity purpose, we add this
    // test
    if (degradedStatusThreshold > 0)
    {
      for (DataServerHandler serverHandler : connectedDSs.values())
      {
        // Get number of pending changes for this server
        final int nChanges = serverHandler.getRcvMsgQueueSize();
        if (logger.isTraceEnabled())
        {
          logger.trace("In RS " + getLocalRSServerId() + ", for baseDN="
              + getBaseDN() + ": " + "Status analyzer: DS "
              + serverHandler.getServerId() + " has " + nChanges
              + " message(s) in writer queue.");
        }

        // Check status to know if it is relevant to change the status. Do not
        // take RSD lock to test. If we attempt to change the status whereas
        // the current status does allow it, this will be noticed by
        // the changeStatusFromStatusAnalyzer() method. This allows to take the
        // lock roughly only when needed versus every sleep time timeout.
        if (nChanges >= degradedStatusThreshold)
        {
          // Queue grew past the threshold: degrade a currently-normal DS.
          if (serverHandler.getStatus() == NORMAL_STATUS
              && changeStatus(serverHandler, TO_DEGRADED_STATUS_EVENT))
          {
            break; // Interrupted.
          }
        }
        else
        {
          // Queue shrank below the threshold: restore a degraded DS.
          if (serverHandler.getStatus() == DEGRADED_STATUS
              && changeStatus(serverHandler, TO_NORMAL_STATUS_EVENT))
          {
            break; // Interrupted.
          }
        }
      }
    }
  }

  /**
   * Sends any enqueued status messages to the rest of the topology.
   */
  void sendPendingStatusMessages()
  {
    /*
     * Take a snapshot of pending status notifications in order to avoid holding
     * the broadcast lock for too long. In addition, clear the notifications so
     * that they are not resent the next time.
     */
    final PendingStatusMessages savedState;
    synchronized (pendingStatusMessagesLock)
    {
      savedState = pendingStatusMessages;
      pendingStatusMessages = new PendingStatusMessages();
    }
    sendPendingChangeTimeHeartbeatMsgs(savedState);
    sendPendingTopologyMsgs(savedState);
    sendPendingMonitorMsgs(savedState);
  }

  /**
   * Sends the pending monitor messages to their destination DSs and RSs.
   * Messages whose destination is no longer connected are silently dropped.
   *
   * @param pendingMsgs the snapshot of pending messages to send
   */
  private void sendPendingMonitorMsgs(final PendingStatusMessages pendingMsgs)
  {
    for (Entry<Integer, MonitorMsg> msg : pendingMsgs.pendingDSMonitorMsgs
        .entrySet())
    {
      ServerHandler ds = connectedDSs.get(msg.getKey());
      if (ds != null)
      {
        try
        {
          ds.send(msg.getValue());
        }
        catch (IOException e)
        {
          // Ignore: connection closed.
        }
      }
    }
    for (Entry<Integer, MonitorMsg> msg : pendingMsgs.pendingRSMonitorMsgs
        .entrySet())
    {
      ServerHandler rs = connectedRSs.get(msg.getKey());
      if (rs != null)
      {
        try
        {
          rs.send(msg.getValue());
        }
        catch (IOException e)
        {
          // We log the error. The requestor will detect a timeout or
          // any other failure on the connection.

          // FIXME: why do we log for RSs but not DSs?
          logger.traceException(e);
          logger.error(ERR_CHANGELOG_ERROR_SENDING_MSG, msg.getValue().getDestination());
        }
      }
    }
  }

  /**
   * Forwards the pending change-time heartbeats to every connected RS that
   * speaks protocol version 3 or later; a send failure stops that RS.
   *
   * @param pendingMsgs the snapshot of pending messages to send
   */
  private void sendPendingChangeTimeHeartbeatMsgs(PendingStatusMessages pendingMsgs)
  {
    for (ChangeTimeHeartbeatMsg pendingHeartbeat : pendingMsgs.pendingHeartbeats.values())
    {
      for (ReplicationServerHandler rsHandler : connectedRSs.values())
      {
        try
        {
          if (rsHandler.getProtocolVersion() >= REPLICATION_PROTOCOL_V3)
          {
            rsHandler.send(pendingHeartbeat);
          }
        }
        catch (IOException e)
        {
          logger.traceException(e);
          logger.error(ERR_CHANGELOG_ERROR_SENDING_MSG, "Replication Server "
              + localReplicationServer.getReplicationPort() + " " + baseDN
              + " " + localReplicationServer.getServerId());
          stopServer(rsHandler, false);
        }
      }
    }
  }

  /**
   * Sends the pending topology messages: one per connected DS (except the
   * excluded one, if any), and a single shared message to every connected RS.
   *
   * @param pendingMsgs the snapshot of pending messages to send
   */
  private void sendPendingTopologyMsgs(PendingStatusMessages pendingMsgs)
  {
    if (pendingMsgs.sendDSTopologyMsg)
    {
      for (ServerHandler handler : connectedDSs.values())
      {
        if (handler.getServerId() != pendingMsgs.excludedDSForTopologyMsg)
        {
          // Each DS gets a topology message tailored to its own server id.
          final TopologyMsg topoMsg = createTopologyMsgForDS(handler
              .getServerId());
          sendTopologyMsg("directory", handler, topoMsg);
        }
      }
    }

    if (pendingMsgs.sendRSTopologyMsg && !connectedRSs.isEmpty())
    {
      final TopologyMsg topoMsg = createTopologyMsgForRS();
      for (ServerHandler handler : connectedRSs.values())
      {
        sendTopologyMsg("replication", handler, topoMsg);
      }
    }
  }

  /**
   * Builds the monitor response for a monitor request and queues it for the
   * requesting peer: DS requestors get the global topology view, RS requestors
   * get the local view.
   *
   * @param msg the monitor request to answer
   * @param sender the peer that sent the request
   */
  private void enqueueMonitorMsg(MonitorRequestMsg msg, ServerHandler sender)
  {
    /*
     * If the request comes from a Directory Server we need to build the full
     * list of all servers in the topology and send back a MonitorMsg with the
     * full list of all the servers in the topology.
     */
    if (sender.isDataServer())
    {
      MonitorMsg monitorMsg = createGlobalTopologyMonitorMsg(
          msg.getDestination(), msg.getSenderID(),
          domainMonitor.getMonitorData());
      synchronized (pendingStatusMessagesLock)
      {
        pendingStatusMessages.enqueueDSMonitorMsg(sender.getServerId(),
            monitorMsg);
      }
    }
    else
    {
      MonitorMsg monitorMsg = createLocalTopologyMonitorMsg(
          msg.getDestination(), msg.getSenderID());
      synchronized (pendingStatusMessagesLock)
      {
        pendingStatusMessages.enqueueRSMonitorMsg(sender.getServerId(),
            monitorMsg);
      }
    }
    statusAnalyzer.notifyPendingStatusMessage();
  }
}