/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.shortcircuit;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Waitable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * The ShortCircuitCache tracks things which the client needs to access
 * HDFS block files via short-circuit.
 *
 * These things include: memory-mapped regions, file descriptors, and shared
 * memory areas for communicating with the DataNode.
 *
 * All mutable cache state (the replica map, both eviction maps, the closed
 * flag and the mmap counter) is guarded by a single {@link ReentrantLock}.
 */
@InterfaceAudience.Private
public class ShortCircuitCache implements Closeable {
  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);

  /**
   * Expiry task which makes sure that the file descriptors get closed
   * after a while.
   */
  private class CacheCleaner implements Runnable, Closeable {
    /** Handle used to cancel the periodic schedule; null until set. */
    private ScheduledFuture<?> future;

    /**
     * Run one pass of the CacheCleaner.
     *
     * Whenever a thread requests a ShortCircuitReplica object, we will make
     * sure it gets one.  That ShortCircuitReplica object can then be re-used
     * when another thread requests a ShortCircuitReplica object for the same
     * block.  So in that sense, there is no maximum size to the cache.
     *
     * However, when a ShortCircuitReplica object is unreferenced by the
     * thread(s) that are using it, it becomes evictable.  There are two
     * separate eviction lists-- one for mmaped objects, and another for
     * non-mmaped objects.  We do this in order to avoid having the regular
     * files kick the mmaped files out of the cache too quickly.  Reusing
     * an already-existing mmap gives a huge performance boost, since the
     * page table entries don't have to be re-populated.  Both the mmap
     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
     */
    @Override
    public void run() {
      ShortCircuitCache.this.lock.lock();
      try {
        if (ShortCircuitCache.this.closed) {
          return;
        }
        long curMs = Time.monotonicNow();

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": cache cleaner running at " + curMs);
        }

        int numDemoted = demoteOldEvictableMmaped(curMs);
        int numPurged = 0;
        while (true) {
          // Always look at the oldest entry.  purge() removes it from the
          // map, so each iteration makes progress.  Using firstEntry()
          // rather than ceilingEntry(0) also covers negative keys, which
          // System.nanoTime() is allowed to produce.
          Entry<Long, ShortCircuitReplica> oldest = evictable.firstEntry();
          if (oldest == null) {
            break;
          }
          long evictionTimeMs = TimeUnit.MILLISECONDS.convert(
              oldest.getKey(), TimeUnit.NANOSECONDS);
          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) {
            // The oldest entry has not expired yet, so none has.
            break;
          }
          ShortCircuitReplica replica = oldest.getValue();
          if (LOG.isTraceEnabled()) {
            LOG.trace("CacheCleaner: purging " + replica + ": " +
                StringUtils.getStackTrace(Thread.currentThread()));
          }
          purge(replica);
          numPurged++;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": finishing cache cleaner run started at " +
              curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
              "purged " + numPurged + " replicas.");
        }
      } finally {
        ShortCircuitCache.this.lock.unlock();
      }
    }

    /**
     * Cancel the periodic schedule, if one was registered.  A run that is
     * already in progress is not interrupted.
     */
    @Override
    public void close() throws IOException {
      if (future != null) {
        future.cancel(false);
      }
    }

    /**
     * Remember the future returned by the scheduler so that
     * {@link #close()} can cancel it.
     *
     * @param future  The scheduled future for this cleaner.
     */
    public void setFuture(ScheduledFuture<?> future) {
      this.future = future;
    }

    /**
     * Get the rate at which this cleaner thread should be scheduled.
     *
     * We do this by taking the minimum expiration time and dividing by 4.
     *
     * @return the rate in milliseconds at which this thread should be
     *         scheduled; never less than 1.
     */
    public long getRateInMs() {
      long minLifespanMs =
          Math.min(maxNonMmappedEvictableLifespanMs,
              maxEvictableMmapedLifespanMs);
      long sampleTimeMs = minLifespanMs / 4;
      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
    }
  }

  /**
   * A task which asks the DataNode to release a short-circuit shared memory
   * slot.  If successful, this will tell the DataNode to stop monitoring
   * changes to the mlock status of the replica associated with the slot.
   * It will also allow us (the client) to re-use this slot for another
   * replica.  If we can't communicate with the DataNode for some reason,
   * we tear down the shared memory segment to avoid being in an inconsistent
   * state.
   */
  private class SlotReleaser implements Runnable {
    /**
     * The slot that we need to release.
     */
    private final Slot slot;

    SlotReleaser(Slot slot) {
      this.slot = slot;
    }

    /**
     * Send a release request for the slot over a fresh domain socket
     * connection and, depending on the outcome, either free the slot or
     * shut down the whole shared memory segment.
     */
    @Override
    public void run() {
      if (LOG.isTraceEnabled()) {
        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
      }
      final DfsClientShm shm = (DfsClientShm)slot.getShm();
      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
      DomainSocket sock = null;
      DataOutputStream out = null;
      final String path = shmSock.getPath();
      boolean success = false;
      try {
        sock = DomainSocket.connect(path);
        out = new DataOutputStream(
            new BufferedOutputStream(sock.getOutputStream()));
        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
        DataInputStream in = new DataInputStream(sock.getInputStream());
        ReleaseShortCircuitAccessResponseProto resp =
            ReleaseShortCircuitAccessResponseProto.parseFrom(
                PBHelper.vintPrefixed(in));
        if (resp.getStatus() != Status.SUCCESS) {
          String error = resp.hasError() ? resp.getError() : "(unknown)";
          throw new IOException(resp.getStatus().toString() + ": " + error);
        }
        if (LOG.isTraceEnabled()) {
          LOG.trace(ShortCircuitCache.this + ": released " + slot);
        }
        success = true;
      } catch (IOException e) {
        LOG.error(ShortCircuitCache.this + ": failed to release " +
            "short-circuit shared memory slot " + slot + " by sending " +
            "ReleaseShortCircuitAccessRequestProto to " + path +
            ".  Closing shared memory segment.", e);
      } finally {
        if (success) {
          shmManager.freeSlot(slot);
        } else {
          // We don't know what the DataNode thinks of this slot any more,
          // so give up on the entire segment.
          shm.getEndpointShmManager().shutdown(shm);
        }
        IOUtils.cleanup(LOG, sock, out);
      }
    }
  }

  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }

  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service that runs the SlotReleaser tasks.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   * Not final because {@link #close()} resets it to 0.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   * Not final because {@link #close()} resets it to 0.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * Null when shared memory is disabled or could not be set up.
   */
  private final DfsClientShmManager shmManager;

  /**
   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
   *
   * @param conf  The configuration to read the cache settings from.
   * @return      A new cache configured from conf.
   */
  public static ShortCircuitCache fromConf(Configuration conf) {
    return new ShortCircuitCache(
        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
  }

  /**
   * Create a ShortCircuitCache.
   *
   * @param maxTotalSize                      Maximum number of evictable
   *                                          replicas (mmapped plus
   *                                          non-mmapped); must be >= 0.
   * @param maxNonMmappedEvictableLifespanMs  Maximum age of a non-mmapped
   *                                          evictable replica; must be >= 0.
   * @param maxEvictableMmapedSize            Maximum number of mmapped
   *                                          evictable replicas; must be
   *                                          >= 0.
   * @param maxEvictableMmapedLifespanMs      Maximum age of an mmapped
   *                                          evictable replica; must be >= 0.
   * @param mmapRetryTimeoutMs                Minimum wait after a failed
   *                                          mmap before retrying.
   * @param staleThresholdMs                  Staleness threshold reported by
   *                                          {@link #getStaleThresholdMs()}.
   * @param shmInterruptCheckMs               Passed to the
   *                                          DfsClientShmManager; shared
   *                                          memory is disabled when this is
   *                                          not positive.
   * @throws IllegalArgumentException         If one of the checked arguments
   *                                          is negative.
   */
  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
    Preconditions.checkArgument(maxTotalSize >= 0);
    this.maxTotalSize = maxTotalSize;
    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
    this.staleThresholdMs = staleThresholdMs;
    DfsClientShmManager shmManager = null;
    if ((shmInterruptCheckMs > 0) &&
        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
      try {
        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
      } catch (IOException e) {
        // Best effort: the cache still works without shared memory.
        LOG.error("failed to create ShortCircuitShmManager", e);
      }
    }
    this.shmManager = shmManager;
  }

  public long getStaleThresholdMs() {
    return staleThresholdMs;
  }

  /**
   * Increment the reference count of a replica, and remove it from any free
   * list it may be in.
   *
   * This method acquires the (reentrant) cache lock itself, so it may be
   * called with or without the lock held.
   *
   * @param replica      The replica we're removing.
   */
  private void ref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // Guava message templates only understand %s.
      Preconditions.checkArgument(replica.refCount > 0,
          "can't ref %s because its refCount reached %s", replica,
          replica.refCount);
      Long evictableTimeNs = replica.getEvictableTimeNs();
      replica.refCount++;
      if (evictableTimeNs != null) {
        String removedFrom = removeEvictable(replica);
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": " + removedFrom +
              " no longer contains " + replica + ".  refCount " +
              (replica.refCount - 1) + " -> " + replica.refCount +
              StringUtils.getStackTrace(Thread.currentThread()));
        }
      } else if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": replica  refCount " +
            (replica.refCount - 1) + " -> " + replica.refCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Unreference a replica.
   *
   * This method acquires the (reentrant) cache lock itself, so it may be
   * called with or without the lock held.
   *
   * @param replica   The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale or unusable, but we haven't purged it yet,
      // let's do that.  It would be a shame to evict a non-stale replica so
      // that we could put a stale or unusable one into the cache.
      if (!replica.purged) {
        String purgeReason = null;
        if (!replica.getDataStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its data channel is closed.";
        } else if (!replica.getMetaStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its meta channel is closed.";
        } else if (replica.isStale()) {
          purgeReason = "purging replica because it is stale.";
        }
        if (purgeReason != null) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(this + ": " + purgeReason);
          }
          purge(replica);
        }
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
            "Replica %s reached a refCount of 0 without being purged", replica);
        replica.close();
      } else if (newRefCount == 1) {
        // Only the cache's own reference is left.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica %s had a refCount higher than 1, " +
                "but was still evictable (evictableTimeNs = %s)",
            replica, replica.getEvictableTimeNs());
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          shouldTrimEvictionMaps = true;
        }
      } else {
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = %s" +
                " for %s)", replica.refCount, replica);
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Demote old evictable mmaps into the regular eviction map.
   *
   * A replica is demoted when it is older than the mmap lifespan, or when
   * the mmapped eviction map has reached its maximum size.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param now   Current time in monotonic milliseconds.
   * @return      Number of replicas demoted.
   */
  private int demoteOldEvictableMmaped(long now) {
    int numDemoted = 0;
    boolean needMoreSpace = false;

    while (true) {
      // The entry is removed from evictableMmapped below, so firstEntry()
      // advances on every iteration.  Unlike ceilingEntry(0) it also
      // handles negative System.nanoTime() keys.
      Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
      if (entry == null) {
        break;
      }
      Long evictionTimeNs = entry.getKey();
      long evictionTimeMs =
          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
        if (evictableMmapped.size() < maxEvictableMmapedSize) {
          break;
        }
        needMoreSpace = true;
      }
      ShortCircuitReplica replica = entry.getValue();
      if (LOG.isTraceEnabled()) {
        String rationale = needMoreSpace ? "because we need more space" :
            "because it's too old";
        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
            rationale + ": " +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      removeEvictable(replica, evictableMmapped);
      munmap(replica);
      // Keep the original eviction time so the replica's age is preserved.
      insertEvictable(evictionTimeNs, replica, evictable);
      numDemoted++;
    }
    return numDemoted;
  }

  /**
   * Trim the eviction lists.
   *
   * First demotes old mmapped replicas, then purges the oldest replicas
   * until the combined size of both maps is at most maxTotalSize.
   *
   * You must hold the cache lock while calling this function.
   */
  private void trimEvictionMaps() {
    long now = Time.monotonicNow();
    demoteOldEvictableMmaped(now);

    while (true) {
      long evictableSize = evictable.size();
      long evictableMmappedSize = evictableMmapped.size();
      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
        return;
      }
      // Prefer to sacrifice non-mmapped replicas.
      ShortCircuitReplica replica;
      if (evictableSize == 0) {
        replica = evictableMmapped.firstEntry().getValue();
      } else {
        replica = evictable.firstEntry().getValue();
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      purge(replica);
    }
  }

  /**
   * Munmap a replica, updating outstandingMmapCount.
   *
   * @param replica  The replica to munmap.
   */
  private void munmap(ShortCircuitReplica replica) {
    replica.munmap();
    outstandingMmapCount--;
  }

  /**
   * Remove a replica from an evictable map.
   *
   * The map is chosen by whether the replica currently has an mmap.
   *
   * @param replica   The replica to remove.
   * @return          The map it was removed from.
   */
  private String removeEvictable(ShortCircuitReplica replica) {
    if (replica.hasMmap()) {
      removeEvictable(replica, evictableMmapped);
      return "evictableMmapped";
    } else {
      removeEvictable(replica, evictable);
      return "evictable";
    }
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica   The replica to remove.
   * @param map       The map to remove it from.
   * @throws NullPointerException   If the replica has no eviction time.
   * @throws IllegalStateException  If the map did not hold this replica
   *                                under its eviction time.
   */
  private void removeEvictable(ShortCircuitReplica replica,
      TreeMap<Long, ShortCircuitReplica> map) {
    Long evictableTimeNs = replica.getEvictableTimeNs();
    Preconditions.checkNotNull(evictableTimeNs);
    ShortCircuitReplica removed = map.remove(evictableTimeNs);
    Preconditions.checkState(removed == replica,
        "failed to make %s unevictable", replica);
    replica.setEvictableTimeNs(null);
  }

  /**
   * Insert a replica into an evictable map.
   *
   * If an element already exists with this eviction time, we add a nanosecond
   * to it until we find an unused key.
   *
   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
   * @param replica          The replica to insert.
   * @param map              The map to insert it into.
   */
  private void insertEvictable(Long evictionTimeNs,
      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
    while (map.containsKey(evictionTimeNs)) {
      evictionTimeNs++;
    }
    Preconditions.checkState(null == replica.getEvictableTimeNs());
    replica.setEvictableTimeNs(evictionTimeNs);
    map.put(evictionTimeNs, replica);
  }

  /**
   * Purge a replica from the cache.
   *
   * This doesn't necessarily close the replica, since there may be
   * outstanding references to it.  However, it does mean the cache won't
   * hand it out to anyone after this.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param replica   The replica being removed.
   */
  private void purge(ShortCircuitReplica replica) {
    boolean removedFromInfoMap = false;
    String evictionMapName = null;
    Preconditions.checkArgument(!replica.purged);
    replica.purged = true;
    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
    if (val != null) {
      ShortCircuitReplicaInfo info = val.getVal();
      // Only drop the map entry if it still refers to this very replica.
      if ((info != null) && (info.getReplica() == replica)) {
        replicaInfoMap.remove(replica.key);
        removedFromInfoMap = true;
      }
    }
    Long evictableTimeNs = replica.getEvictableTimeNs();
    if (evictableTimeNs != null) {
      evictionMapName = removeEvictable(replica);
    }
    if (LOG.isTraceEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append(this).append(": purged ").
          append(replica).append(" from the cache.");
      if (removedFromInfoMap) {
        builder.append("  Removed from the replicaInfoMap.");
      }
      if (evictionMapName != null) {
        builder.append("  Removed from ").append(evictionMapName);
      }
      LOG.trace(builder.toString());
    }
    // Drop the reference the cache itself was holding.
    unref(replica);
  }

  /**
   * Fetch or create a replica.
   *
   * This method takes the cache lock itself; callers do not need to hold
   * it.  If an entry for the key already exists, we wait for it and return
   * it when usable.  If that attempt ends in a RetriableException (for
   * example the cached replica was purged or stale, or the wait was
   * interrupted), we do not wait again: we register a fresh waitable for
   * the key and load the replica ourselves.
   *
   * @param key          Key to use for lookup.
   * @param creator      Replica creator callback.  Will be called without
   *                     the cache lock being held.
   *
   * @return             Null if the cache is closed.
   *                     Otherwise a ShortCircuitReplicaInfo, which may hold
   *                     a replica, an InvalidToken exception, or neither if
   *                     loading failed.
   */
  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator) {
    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
    lock.lock();
    try {
      if (closed) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": can't fetchOrCreate " + key +
              " because the cache is closed.");
        }
        return null;
      }
      ShortCircuitReplicaInfo info = null;
      Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
      if (waitable != null) {
        try {
          info = fetch(key, waitable);
        } catch (RetriableException e) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(this + ": retrying " + e.getMessage());
          }
          // Fall through and load the replica ourselves.
        }
      }
      if (info != null) {
        return info;
      }
      // We need to load the replica ourselves.
      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
      replicaInfoMap.put(key, newWaitable);
    } finally {
      lock.unlock();
    }
    return create(key, creator, newWaitable);
  }

  /**
   * Fetch an existing ReplicaInfo object.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param key       The key that we're using.
   * @param waitable  The waitable object to wait on.
   * @return          The existing ReplicaInfo object.  It may carry an
   *                  InvalidToken exception or no replica at all if the
   *                  load failed.
   *
   * @throws RetriableException   If the caller needs to retry.
   */
  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
    // Another thread is already in the process of loading this
    // ShortCircuitReplica.  So we simply wait for it to complete.
    ShortCircuitReplicaInfo info;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": found waitable for " + key);
      }
      info = waitable.await();
    } catch (InterruptedException e) {
      LOG.info(this + ": interrupted while waiting for " + key);
      // Preserve the interrupt status for our caller.
      Thread.currentThread().interrupt();
      throw new RetriableException("interrupted");
    }
    if (info.getInvalidTokenException() != null) {
      LOG.info(this + ": could not get " + key + " due to InvalidToken " +
          "exception.", info.getInvalidTokenException());
      return info;
    }
    ShortCircuitReplica replica = info.getReplica();
    if (replica == null) {
      LOG.warn(this + ": failed to get " + key);
      return info;
    }
    if (replica.purged) {
      // Ignore replicas that have already been purged from the cache.
      throw new RetriableException("Ignoring purged replica " +
          replica + ".  Retrying.");
    }
    // Check if the replica is stale before using it.
    // If it is, purge it and retry.
    if (replica.isStale()) {
      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
          "this replica from the replicaInfoMap and retrying.");
      // Remove the cache's reference to the replica.  This may or may not
      // trigger a close.
      purge(replica);
      throw new RetriableException("ignoring stale replica " + replica);
    }
    ref(replica);
    return info;
  }

  /**
   * Load a new replica through the creator callback and publish the result
   * to any threads waiting on newWaitable.
   *
   * The callback runs without the cache lock; the lock is taken afterwards
   * to update the cache.
   *
   * @param key          The key being loaded.
   * @param creator      The callback which creates the replica info.
   * @param newWaitable  The waitable registered in replicaInfoMap for key.
   * @return             The loaded info; never null.  An empty
   *                     ShortCircuitReplicaInfo is returned if the callback
   *                     returned null or threw a RuntimeException.
   */
  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator,
      Waitable<ShortCircuitReplicaInfo> newWaitable) {
    // Handle loading a new replica.
    ShortCircuitReplicaInfo info = null;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": loading " + key);
      }
      info = creator.createShortCircuitReplicaInfo();
    } catch (RuntimeException e) {
      LOG.warn(this + ": failed to load " + key, e);
    }
    if (info == null) {
      info = new ShortCircuitReplicaInfo();
    }
    lock.lock();
    try {
      if (info.getReplica() != null) {
        // On success, make sure the cache cleaner thread is running.
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": successfully loaded " + info.getReplica());
        }
        startCacheCleanerThreadIfNeeded();
        // Note: new ShortCircuitReplicas start with a refCount of 2,
        // indicating that both this cache and whoever requested the
        // creation of the replica hold a reference.  So we don't need
        // to increment the reference count here.
      } else {
        // On failure, remove the waitable from the replicaInfoMap.
        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
        if (waitableInMap == newWaitable) {
          replicaInfoMap.remove(key);
        }
        if (info.getInvalidTokenException() != null) {
          LOG.info(this + ": could not load " + key + " due to InvalidToken " +
              "exception.", info.getInvalidTokenException());
        } else {
          LOG.warn(this + ": failed to load " + key);
        }
      }
      newWaitable.provide(info);
    } finally {
      lock.unlock();
    }
    return info;
  }

  /**
   * Create and schedule the CacheCleaner if that has not happened yet.
   *
   * You must hold the cache lock while calling this function.
   */
  private void startCacheCleanerThreadIfNeeded() {
    if (cacheCleaner == null) {
      cacheCleaner = new CacheCleaner();
      long rateMs = cacheCleaner.getRateInMs();
      ScheduledFuture<?> future =
          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
              TimeUnit.MILLISECONDS);
      cacheCleaner.setFuture(future);
      if (LOG.isDebugEnabled()) {
        LOG.debug(this + ": starting cache cleaner thread which will run " +
            "every " + rateMs + " ms");
      }
    }
  }

  /**
   * Get the existing mmap of a replica, or create one.
   *
   * replica.mmapData encodes the mmap state: a MappedByteBuffer when the
   * mmap exists, a Long holding the monotonic time of the last failed
   * attempt, a Condition while another thread is creating the mmap, or
   * null when nothing has been attempted.
   *
   * @param replica   The replica to mmap.
   * @param anchored  Passed through to the ClientMmap constructor.
   * @return          A ClientMmap holding a new reference to the replica,
   *                  or null if the mmap could not be created or a previous
   *                  attempt failed less than mmapRetryTimeoutMs ago.
   */
  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
      boolean anchored) {
    Condition newCond;
    lock.lock();
    try {
      while (replica.mmapData != null) {
        if (replica.mmapData instanceof MappedByteBuffer) {
          ref(replica);
          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
          return new ClientMmap(replica, mmap, anchored);
        } else if (replica.mmapData instanceof Long) {
          long lastAttemptTimeMs = (Long)replica.mmapData;
          long delta = Time.monotonicNow() - lastAttemptTimeMs;
          if (delta < mmapRetryTimeoutMs) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(this + ": can't create client mmap for " +
                  replica + " because we failed to " +
                  "create one just " + delta + "ms ago.");
            }
            return null;
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": retrying client mmap for " + replica +
                ", " + delta + " ms after the previous failure.");
          }
        } else if (replica.mmapData instanceof Condition) {
          // Another thread is loading the mmap; wait and re-examine.
          Condition cond = (Condition)replica.mmapData;
          cond.awaitUninterruptibly();
        } else {
          Preconditions.checkState(false, "invalid mmapData type %s",
              replica.mmapData.getClass().getName());
        }
      }
      // Mark the replica as "mmap in progress" before dropping the lock.
      newCond = lock.newCondition();
      replica.mmapData = newCond;
    } finally {
      lock.unlock();
    }
    // Load the mmap without holding the cache lock.
    MappedByteBuffer map = replica.loadMmapInternal();
    lock.lock();
    try {
      if (map == null) {
        // Record the failure time so that retries are rate-limited.
        replica.mmapData = Long.valueOf(Time.monotonicNow());
        newCond.signalAll();
        return null;
      } else {
        outstandingMmapCount++;
        replica.mmapData = map;
        ref(replica);
        newCond.signalAll();
        return new ClientMmap(replica, map, anchored);
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the cache and free all associated resources.
   *
   * Calling this more than once has no further effect.
   */
  @Override
  public void close() {
    // Acquire the lock before entering the try block, so that the finally
    // clause never unlocks a lock which we failed to acquire.
    lock.lock();
    try {
      if (closed) {
        return;
      }
      closed = true;
      LOG.info(this + ": closing");
      maxNonMmappedEvictableLifespanMs = 0;
      maxEvictableMmapedSize = 0;
      // Cancel the scheduled cacheCleaner, if there is one.
      IOUtils.cleanup(LOG, cacheCleaner);
      // Purge all replicas.
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
        if (entry == null) {
          break;
        }
        purge(entry.getValue());
      }
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
        if (entry == null) {
          break;
        }
        purge(entry.getValue());
      }
    } finally {
      lock.unlock();
    }
    IOUtils.cleanup(LOG, shmManager);
  }

  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }

  /**
   * Show the current state of the cache to a visitor.
   *
   * The visitor is invoked with the cache lock held.  The replicas and
   * failedLoads maps are snapshots; the two eviction maps are the cache's
   * own live maps.
   *
   * @param visitor   The visitor to call.
   */
  @VisibleForTesting // ONLY for testing
  public void accept(CacheVisitor visitor) {
    lock.lock();
    try {
      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
      Map<ExtendedBlockId, InvalidToken> failedLoads =
          new HashMap<ExtendedBlockId, InvalidToken>();
      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
          replicaInfoMap.entrySet()) {
        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
        if (waitable.hasVal()) {
          if (waitable.getVal().getReplica() != null) {
            replicas.put(entry.getKey(), waitable.getVal().getReplica());
          } else {
            // The exception may be null here, indicating a failed load that
            // isn't the result of an invalid block token.
            failedLoads.put(entry.getKey(),
                waitable.getVal().getInvalidTokenException());
          }
        }
      }
      if (LOG.isDebugEnabled()) {
        StringBuilder builder = new StringBuilder();
        builder.append("visiting ").append(visitor.getClass().getName()).
            append(" with outstandingMmapCount=").append(outstandingMmapCount).
            append(", replicas=");
        String prefix = "";
        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", failedLoads=");
        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictable=");
        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictableMmapped=");
        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        LOG.debug(builder.toString());
      }
      visitor.visit(outstandingMmapCount, replicas, failedLoads,
          evictable, evictableMmapped);
    } finally {
      lock.unlock();
    }
  }

  @Override
  public String toString() {
    return "ShortCircuitCache(0x" +
        Integer.toHexString(System.identityHashCode(this)) + ")";
  }

  /**
   * Allocate a new shared memory slot.
   *
   * @param datanode       The datanode to allocate a shm slot with.
   * @param peer           A peer connected to the datanode.
   * @param usedPeer       Will be set to true if we use up the provided peer.
   * @param blockId        The block id and block pool id of the block we're
   *                         allocating this slot for.
   * @param clientName     The name of the DFSClient allocating the shared
   *                         memory.
   * @return               Null if short-circuit shared memory is disabled;
   *                         a short-circuit memory slot otherwise.
   * @throws IOException   An exception if there was an error talking to
   *                         the datanode.
   */
  public Slot allocShmSlot(DatanodeInfo datanode,
      DomainPeer peer, MutableBoolean usedPeer,
      ExtendedBlockId blockId, String clientName) throws IOException {
    if (shmManager != null) {
      return shmManager.allocSlot(datanode, peer, usedPeer,
          blockId, clientName);
    } else {
      return null;
    }
  }

  /**
   * Free a slot immediately.
   *
   * ONLY use this if the DataNode is not yet aware of the slot.
   *
   * @param slot           The slot to free.
   * @throws IllegalStateException  If shared memory is disabled.
   */
  public void freeSlot(Slot slot) {
    Preconditions.checkState(shmManager != null);
    slot.makeInvalid();
    shmManager.freeSlot(slot);
  }

  /**
   * Schedule a shared memory slot to be released.
   *
   * @param slot           The slot to release.
   * @throws IllegalStateException  If shared memory is disabled.
   */
  public void scheduleSlotReleaser(Slot slot) {
    Preconditions.checkState(shmManager != null);
    releaserExecutor.execute(new SlotReleaser(slot));
  }

  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
}