001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataInputStream;
023import java.io.DataOutputStream;
024import java.io.IOException;
025import java.nio.MappedByteBuffer;
026import java.util.HashMap;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.TreeMap;
030import java.util.concurrent.ScheduledFuture;
031import java.util.concurrent.ScheduledThreadPoolExecutor;
032import java.util.concurrent.TimeUnit;
033import java.util.concurrent.locks.Condition;
034import java.util.concurrent.locks.ReentrantLock;
035
036import org.apache.commons.lang.mutable.MutableBoolean;
037import org.apache.commons.logging.Log;
038import org.apache.commons.logging.LogFactory;
039import org.apache.hadoop.classification.InterfaceAudience;
040import org.apache.hadoop.conf.Configuration;
041import org.apache.hadoop.hdfs.DFSConfigKeys;
042import org.apache.hadoop.hdfs.ExtendedBlockId;
043import org.apache.hadoop.hdfs.net.DomainPeer;
044import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
045import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
046import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
047import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
048import org.apache.hadoop.hdfs.protocolPB.PBHelper;
049import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
050import org.apache.hadoop.io.IOUtils;
051import org.apache.hadoop.ipc.RetriableException;
052import org.apache.hadoop.net.unix.DomainSocket;
053import org.apache.hadoop.net.unix.DomainSocketWatcher;
054import org.apache.hadoop.security.token.SecretManager.InvalidToken;
055import org.apache.hadoop.util.StringUtils;
056import org.apache.hadoop.util.Time;
057import org.apache.hadoop.util.Waitable;
058
059import com.google.common.annotations.VisibleForTesting;
060import com.google.common.base.Preconditions;
061import com.google.common.util.concurrent.ThreadFactoryBuilder;
062
063/**
064 * The ShortCircuitCache tracks things which the client needs to access
065 * HDFS block files via short-circuit.
066 *
067 * These things include: memory-mapped regions, file descriptors, and shared
068 * memory areas for communicating with the DataNode.
069 */
070@InterfaceAudience.Private
071public class ShortCircuitCache implements Closeable {
072  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
073
074  /**
075   * Expiry thread which makes sure that the file descriptors get closed
076   * after a while.
077   */
078  private class CacheCleaner implements Runnable, Closeable {
079    private ScheduledFuture<?> future;
080
081    /**
082     * Run the CacheCleaner thread.
083     *
084     * Whenever a thread requests a ShortCircuitReplica object, we will make
085     * sure it gets one.  That ShortCircuitReplica object can then be re-used
086     * when another thread requests a ShortCircuitReplica object for the same
087     * block.  So in that sense, there is no maximum size to the cache.
088     *
089     * However, when a ShortCircuitReplica object is unreferenced by the
090     * thread(s) that are using it, it becomes evictable.  There are two
091     * separate eviction lists-- one for mmaped objects, and another for
092     * non-mmaped objects.  We do this in order to avoid having the regular
093     * files kick the mmaped files out of the cache too quickly.  Reusing
094     * an already-existing mmap gives a huge performance boost, since the
095     * page table entries don't have to be re-populated.  Both the mmap
096     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
097     */
098    @Override
099    public void run() {
100      ShortCircuitCache.this.lock.lock();
101      try {
102        if (ShortCircuitCache.this.closed) return;
103        long curMs = Time.monotonicNow();
104
105        if (LOG.isDebugEnabled()) {
106          LOG.debug(this + ": cache cleaner running at " + curMs);
107        }
108
109        int numDemoted = demoteOldEvictableMmaped(curMs);
110        int numPurged = 0;
111        Long evictionTimeNs = Long.valueOf(0);
112        while (true) {
113          Entry<Long, ShortCircuitReplica> entry = 
114              evictable.ceilingEntry(evictionTimeNs);
115          if (entry == null) break;
116          evictionTimeNs = entry.getKey();
117          long evictionTimeMs = 
118              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
119          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
120          ShortCircuitReplica replica = entry.getValue();
121          if (LOG.isTraceEnabled()) {
122            LOG.trace("CacheCleaner: purging " + replica + ": " + 
123                  StringUtils.getStackTrace(Thread.currentThread()));
124          }
125          purge(replica);
126          numPurged++;
127        }
128
129        if (LOG.isDebugEnabled()) {
130          LOG.debug(this + ": finishing cache cleaner run started at " +
131            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
132            "purged " + numPurged + " replicas.");
133        }
134      } finally {
135        ShortCircuitCache.this.lock.unlock();
136      }
137    }
138
139    @Override
140    public void close() throws IOException {
141      if (future != null) {
142        future.cancel(false);
143      }
144    }
145
146    public void setFuture(ScheduledFuture<?> future) {
147      this.future = future;
148    }
149
150    /**
151     * Get the rate at which this cleaner thread should be scheduled.
152     *
153     * We do this by taking the minimum expiration time and dividing by 4.
154     *
155     * @return the rate in milliseconds at which this thread should be
156     *         scheduled.
157     */
158    public long getRateInMs() {
159      long minLifespanMs =
160          Math.min(maxNonMmappedEvictableLifespanMs,
161              maxEvictableMmapedLifespanMs);
162      long sampleTimeMs = minLifespanMs / 4;
163      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
164    }
165  }
166
167  /**
168   * A task which asks the DataNode to release a short-circuit shared memory
169   * slot.  If successful, this will tell the DataNode to stop monitoring
170   * changes to the mlock status of the replica associated with the slot.
171   * It will also allow us (the client) to re-use this slot for another
172   * replica.  If we can't communicate with the DataNode for some reason,
173   * we tear down the shared memory segment to avoid being in an inconsistent
174   * state.
175   */
176  private class SlotReleaser implements Runnable {
177    /**
178     * The slot that we need to release.
179     */
180    private final Slot slot;
181
182    SlotReleaser(Slot slot) {
183      this.slot = slot;
184    }
185
186    @Override
187    public void run() {
188      if (LOG.isTraceEnabled()) {
189        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
190      }
191      final DfsClientShm shm = (DfsClientShm)slot.getShm();
192      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
193      DomainSocket sock = null;
194      DataOutputStream out = null;
195      final String path = shmSock.getPath();
196      boolean success = false;
197      try {
198        sock = DomainSocket.connect(path);
199        out = new DataOutputStream(
200            new BufferedOutputStream(sock.getOutputStream()));
201        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
202        DataInputStream in = new DataInputStream(sock.getInputStream());
203        ReleaseShortCircuitAccessResponseProto resp =
204            ReleaseShortCircuitAccessResponseProto.parseFrom(
205                PBHelper.vintPrefixed(in));
206        if (resp.getStatus() != Status.SUCCESS) {
207          String error = resp.hasError() ? resp.getError() : "(unknown)";
208          throw new IOException(resp.getStatus().toString() + ": " + error);
209        }
210        if (LOG.isTraceEnabled()) {
211          LOG.trace(ShortCircuitCache.this + ": released " + slot);
212        }
213        success = true;
214      } catch (IOException e) {
215        LOG.error(ShortCircuitCache.this + ": failed to release " +
216            "short-circuit shared memory slot " + slot + " by sending " +
217            "ReleaseShortCircuitAccessRequestProto to " + path +
218            ".  Closing shared memory segment.", e);
219      } finally {
220        if (success) {
221          shmManager.freeSlot(slot);
222        } else {
223          shm.getEndpointShmManager().shutdown(shm);
224        }
225        IOUtils.cleanup(LOG, sock, out);
226      }
227    }
228  }
229
  /**
   * Callback through which the cache asks its caller to build the replica
   * information for a block that it could not serve from the cache.
   */
  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a ShortCircuitReplicaInfo object.  Implementations are
     *         expected to return non-null; the cache substitutes an empty
     *         ShortCircuitReplicaInfo if null comes back.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }
240
241  /**
242   * Lock protecting the cache.
243   */
244  private final ReentrantLock lock = new ReentrantLock();
245
246  /**
247   * The executor service that runs the cacheCleaner.
248   */
249  private final ScheduledThreadPoolExecutor cleanerExecutor
250  = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
251          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
252          build());
253
  /**
   * The executor service that runs the slot releaser tasks.
   */
257  private final ScheduledThreadPoolExecutor releaserExecutor
258      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
259          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
260          build());
261
262  /**
263   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
264   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
265   * exception.
266   */
267  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
268      replicaInfoMap = new HashMap<ExtendedBlockId,
269          Waitable<ShortCircuitReplicaInfo>>();
270
271  /**
272   * The CacheCleaner.  We don't create this and schedule it until it becomes
273   * necessary.
274   */
275  private CacheCleaner cacheCleaner;
276
277  /**
278   * Tree of evictable elements.
279   *
280   * Maps (unique) insertion time in nanoseconds to the element.
281   */
282  private final TreeMap<Long, ShortCircuitReplica> evictable =
283      new TreeMap<Long, ShortCircuitReplica>();
284
  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
289  private final int maxTotalSize;
290
291  /**
292   * Non-mmaped elements older than this will be closed.
293   */
294  private long maxNonMmappedEvictableLifespanMs;
295
296  /**
297   * Tree of mmaped evictable elements.
298   *
299   * Maps (unique) insertion time in nanoseconds to the element.
300   */
301  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
302      new TreeMap<Long, ShortCircuitReplica>();
303
304  /**
305   * Maximum number of mmaped evictable elements.
306   */
307  private int maxEvictableMmapedSize;
308
309  /**
310   * Mmaped elements older than this will be closed.
311   */
312  private final long maxEvictableMmapedLifespanMs;
313
314  /**
315   * The minimum number of milliseconds we'll wait after an unsuccessful
316   * mmap attempt before trying again.
317   */
318  private final long mmapRetryTimeoutMs;
319
320  /**
321   * How long we will keep replicas in the cache before declaring them
322   * to be stale.
323   */
324  private final long staleThresholdMs;
325
326  /**
327   * True if the ShortCircuitCache is closed.
328   */
329  private boolean closed = false;
330
331  /**
332   * Number of existing mmaps associated with this cache.
333   */
334  private int outstandingMmapCount = 0;
335
336  /**
337   * Manages short-circuit shared memory segments for the client.
338   */
339  private final DfsClientShmManager shmManager;
340
341  /**
342   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
343   */
344  public static ShortCircuitCache fromConf(Configuration conf) {
345    return new ShortCircuitCache(
346        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
347            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
348        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
349            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
350        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
351            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
352        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
353            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
354        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
355            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
356        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
357            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
358        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
359            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
360  }
361
362  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
363      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
364      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
365    Preconditions.checkArgument(maxTotalSize >= 0);
366    this.maxTotalSize = maxTotalSize;
367    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
368    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
369    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
370    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
371    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
372    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
373    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
374    this.staleThresholdMs = staleThresholdMs;
375    DfsClientShmManager shmManager = null;
376    if ((shmInterruptCheckMs > 0) &&
377        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
378      try {
379        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
380      } catch (IOException e) {
381        LOG.error("failed to create ShortCircuitShmManager", e);
382      }
383    }
384    this.shmManager = shmManager;
385  }
386
387  public long getStaleThresholdMs() {
388    return staleThresholdMs;
389  }
390
391  /**
392   * Increment the reference count of a replica, and remove it from any free
393   * list it may be in.
394   *
395   * You must hold the cache lock while calling this function.
396   *
397   * @param replica      The replica we're removing.
398   */
399  private void ref(ShortCircuitReplica replica) {
400    lock.lock();
401    try {
402      Preconditions.checkArgument(replica.refCount > 0,
403          "can't ref %s because its refCount reached %d", replica,
404          replica.refCount);
405      Long evictableTimeNs = replica.getEvictableTimeNs();
406      replica.refCount++;
407      if (evictableTimeNs != null) {
408        String removedFrom = removeEvictable(replica);
409        if (LOG.isTraceEnabled()) {
410          LOG.trace(this + ": " + removedFrom +
411              " no longer contains " + replica + ".  refCount " +
412              (replica.refCount - 1) + " -> " + replica.refCount +
413              StringUtils.getStackTrace(Thread.currentThread()));
414
415        }
416      } else if (LOG.isTraceEnabled()) {
417        LOG.trace(this + ": replica  refCount " +
418            (replica.refCount - 1) + " -> " + replica.refCount +
419            StringUtils.getStackTrace(Thread.currentThread()));
420      }
421    } finally {
422      lock.unlock();
423    }
424  }
425
  /**
   * Unreference a replica.
   *
   * This method acquires the cache lock itself.  Because the lock is
   * reentrant, it is also safe to call while already holding the lock, as
   * purge() does.
   *
   * Dropping the reference count to 0 closes the replica.  Dropping it to 1
   * makes a replica that has not been purged evictable, and then trims the
   * eviction maps.
   *
   * @param replica   The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale or unusable, but we haven't purged it yet,
      // let's do that.  It would be a shame to evict a non-stale replica so
      // that we could put a stale or unusable one into the cache.
      if (!replica.purged) {
        String purgeReason = null;
        if (!replica.getDataStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its data channel is closed.";
        } else if (!replica.getMetaStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its meta channel is closed.";
        } else if (replica.isStale()) {
          purgeReason = "purging replica because it is stale.";
        }
        if (purgeReason != null) {
          LOG.debug(this + ": " + purgeReason);
          // purge() ends by calling unref() itself, so one reference has
          // already been dropped by the time it returns.
          purge(replica);
        }
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      // Drop the reference held by the caller of this method.
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
          "Replica %s reached a refCount of 0 without being purged", replica);
        replica.close();
      } else if (newRefCount == 1) {
        // Only one reference is left (presumably the cache's own), so the
        // replica must not already be in an eviction map.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica %s had a refCount higher than 1, " +
              "but was still evictable (evictableTimeNs = %d)",
              replica, replica.getEvictableTimeNs());
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          shouldTrimEvictionMaps = true;
        }
      } else {
        // Sanity check for every other resulting count.
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = %d" +
            " for %s)", replica.refCount, replica);
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      // Trimming is done last, after the replica has been inserted into an
      // eviction map above.
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }
496
497  /**
498   * Demote old evictable mmaps into the regular eviction map.
499   *
500   * You must hold the cache lock while calling this function.
501   *
502   * @param now   Current time in monotonic milliseconds.
503   * @return      Number of replicas demoted.
504   */
505  private int demoteOldEvictableMmaped(long now) {
506    int numDemoted = 0;
507    boolean needMoreSpace = false;
508    Long evictionTimeNs = Long.valueOf(0);
509
510    while (true) {
511      Entry<Long, ShortCircuitReplica> entry = 
512          evictableMmapped.ceilingEntry(evictionTimeNs);
513      if (entry == null) break;
514      evictionTimeNs = entry.getKey();
515      long evictionTimeMs = 
516          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
517      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
518        if (evictableMmapped.size() < maxEvictableMmapedSize) {
519          break;
520        }
521        needMoreSpace = true;
522      }
523      ShortCircuitReplica replica = entry.getValue();
524      if (LOG.isTraceEnabled()) {
525        String rationale = needMoreSpace ? "because we need more space" : 
526            "because it's too old";
527        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
528            rationale + ": " +
529            StringUtils.getStackTrace(Thread.currentThread()));
530      }
531      removeEvictable(replica, evictableMmapped);
532      munmap(replica);
533      insertEvictable(evictionTimeNs, replica, evictable);
534      numDemoted++;
535    }
536    return numDemoted;
537  }
538
539  /**
540   * Trim the eviction lists.
541   */
542  private void trimEvictionMaps() {
543    long now = Time.monotonicNow();
544    demoteOldEvictableMmaped(now);
545
546    while (true) {
547      long evictableSize = evictable.size();
548      long evictableMmappedSize = evictableMmapped.size();
549      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
550        return;
551      }
552      ShortCircuitReplica replica;
553      if (evictableSize == 0) {
554       replica = evictableMmapped.firstEntry().getValue();
555      } else {
556       replica = evictable.firstEntry().getValue();
557      }
558      if (LOG.isTraceEnabled()) {
559        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
560          StringUtils.getStackTrace(Thread.currentThread()));
561      }
562      purge(replica);
563    }
564  }
565
566  /**
567   * Munmap a replica, updating outstandingMmapCount.
568   *
569   * @param replica  The replica to munmap.
570   */
571  private void munmap(ShortCircuitReplica replica) {
572    replica.munmap();
573    outstandingMmapCount--;
574  }
575
576  /**
577   * Remove a replica from an evictable map.
578   *
579   * @param replica   The replica to remove.
580   * @return          The map it was removed from.
581   */
582  private String removeEvictable(ShortCircuitReplica replica) {
583    if (replica.hasMmap()) {
584      removeEvictable(replica, evictableMmapped);
585      return "evictableMmapped";
586    } else {
587      removeEvictable(replica, evictable);
588      return "evictable";
589    }
590  }
591
592  /**
593   * Remove a replica from an evictable map.
594   *
595   * @param replica   The replica to remove.
596   * @param map       The map to remove it from.
597   */
598  private void removeEvictable(ShortCircuitReplica replica,
599      TreeMap<Long, ShortCircuitReplica> map) {
600    Long evictableTimeNs = replica.getEvictableTimeNs();
601    Preconditions.checkNotNull(evictableTimeNs);
602    ShortCircuitReplica removed = map.remove(evictableTimeNs);
603    Preconditions.checkState(removed == replica,
604        "failed to make %s unevictable", replica);
605    replica.setEvictableTimeNs(null);
606  }
607
608  /**
609   * Insert a replica into an evictable map.
610   *
611   * If an element already exists with this eviction time, we add a nanosecond
612   * to it until we find an unused key.
613   *
614   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
615   * @param replica          The replica to insert.
616   * @param map              The map to insert it into.
617   */
618  private void insertEvictable(Long evictionTimeNs,
619      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
620    while (map.containsKey(evictionTimeNs)) {
621      evictionTimeNs++;
622    }
623    Preconditions.checkState(null == replica.getEvictableTimeNs());
624    replica.setEvictableTimeNs(evictionTimeNs);
625    map.put(evictionTimeNs, replica);
626  }
627
628  /**
629   * Purge a replica from the cache.
630   *
631   * This doesn't necessarily close the replica, since there may be
632   * outstanding references to it.  However, it does mean the cache won't
633   * hand it out to anyone after this.
634   *
635   * You must hold the cache lock while calling this function.
636   *
637   * @param replica   The replica being removed.
638   */
639  private void purge(ShortCircuitReplica replica) {
640    boolean removedFromInfoMap = false;
641    String evictionMapName = null;
642    Preconditions.checkArgument(!replica.purged);
643    replica.purged = true;
644    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
645    if (val != null) {
646      ShortCircuitReplicaInfo info = val.getVal();
647      if ((info != null) && (info.getReplica() == replica)) {
648        replicaInfoMap.remove(replica.key);
649        removedFromInfoMap = true;
650      }
651    }
652    Long evictableTimeNs = replica.getEvictableTimeNs();
653    if (evictableTimeNs != null) {
654      evictionMapName = removeEvictable(replica);
655    }
656    if (LOG.isTraceEnabled()) {
657      StringBuilder builder = new StringBuilder();
658      builder.append(this).append(": ").append(": purged ").
659          append(replica).append(" from the cache.");
660      if (removedFromInfoMap) {
661        builder.append("  Removed from the replicaInfoMap.");
662      }
663      if (evictionMapName != null) {
664        builder.append("  Removed from ").append(evictionMapName);
665      }
666      LOG.trace(builder.toString());
667    }
668    unref(replica);
669  }
670
671  /**
672   * Fetch or create a replica.
673   *
674   * You must hold the cache lock while calling this function.
675   *
676   * @param key          Key to use for lookup.
677   * @param creator      Replica creator callback.  Will be called without
678   *                     the cache lock being held.
679   *
680   * @return             Null if no replica could be found or created.
681   *                     The replica, otherwise.
682   */
683  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
684      ShortCircuitReplicaCreator creator) {
685    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
686    lock.lock();
687    try {
688      ShortCircuitReplicaInfo info = null;
689      do {
690        if (closed) {
691          if (LOG.isTraceEnabled()) {
692            LOG.trace(this + ": can't fetchOrCreate " + key +
693                " because the cache is closed.");
694          }
695          return null;
696        }
697        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
698        if (waitable != null) {
699          try {
700            info = fetch(key, waitable);
701          } catch (RetriableException e) {
702            if (LOG.isDebugEnabled()) {
703              LOG.debug(this + ": retrying " + e.getMessage());
704            }
705            continue;
706          }
707        }
708      } while (false);
709      if (info != null) return info;
710      // We need to load the replica ourselves.
711      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
712      replicaInfoMap.put(key, newWaitable);
713    } finally {
714      lock.unlock();
715    }
716    return create(key, creator, newWaitable);
717  }
718
719  /**
720   * Fetch an existing ReplicaInfo object.
721   *
722   * @param key       The key that we're using.
723   * @param waitable  The waitable object to wait on.
724   * @return          The existing ReplicaInfo object, or null if there is
725   *                  none.
726   *
727   * @throws RetriableException   If the caller needs to retry.
728   */
729  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
730      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
731    // Another thread is already in the process of loading this
732    // ShortCircuitReplica.  So we simply wait for it to complete.
733    ShortCircuitReplicaInfo info;
734    try {
735      if (LOG.isTraceEnabled()) {
736        LOG.trace(this + ": found waitable for " + key);
737      }
738      info = waitable.await();
739    } catch (InterruptedException e) {
740      LOG.info(this + ": interrupted while waiting for " + key);
741      Thread.currentThread().interrupt();
742      throw new RetriableException("interrupted");
743    }
744    if (info.getInvalidTokenException() != null) {
745      LOG.info(this + ": could not get " + key + " due to InvalidToken " +
746            "exception.", info.getInvalidTokenException());
747      return info;
748    }
749    ShortCircuitReplica replica = info.getReplica();
750    if (replica == null) {
751      LOG.warn(this + ": failed to get " + key);
752      return info;
753    }
754    if (replica.purged) {
755      // Ignore replicas that have already been purged from the cache.
756      throw new RetriableException("Ignoring purged replica " +
757          replica + ".  Retrying.");
758    }
759    // Check if the replica is stale before using it.
760    // If it is, purge it and retry.
761    if (replica.isStale()) {
762      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
763          "this replica from the replicaInfoMap and retrying.");
764      // Remove the cache's reference to the replica.  This may or may not
765      // trigger a close.
766      purge(replica);
767      throw new RetriableException("ignoring stale replica " + replica);
768    }
769    ref(replica);
770    return info;
771  }
772
773  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
774      ShortCircuitReplicaCreator creator,
775      Waitable<ShortCircuitReplicaInfo> newWaitable) {
776    // Handle loading a new replica.
777    ShortCircuitReplicaInfo info = null;
778    try {
779      if (LOG.isTraceEnabled()) {
780        LOG.trace(this + ": loading " + key);
781      }
782      info = creator.createShortCircuitReplicaInfo();
783    } catch (RuntimeException e) {
784      LOG.warn(this + ": failed to load " + key, e);
785    }
786    if (info == null) info = new ShortCircuitReplicaInfo();
787    lock.lock();
788    try {
789      if (info.getReplica() != null) {
790        // On success, make sure the cache cleaner thread is running.
791        if (LOG.isTraceEnabled()) {
792          LOG.trace(this + ": successfully loaded " + info.getReplica());
793        }
794        startCacheCleanerThreadIfNeeded();
795        // Note: new ShortCircuitReplicas start with a refCount of 2,
796        // indicating that both this cache and whoever requested the 
797        // creation of the replica hold a reference.  So we don't need
798        // to increment the reference count here.
799      } else {
800        // On failure, remove the waitable from the replicaInfoMap.
801        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
802        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
803        if (info.getInvalidTokenException() != null) {
804          LOG.info(this + ": could not load " + key + " due to InvalidToken " +
805              "exception.", info.getInvalidTokenException());
806        } else {
807          LOG.warn(this + ": failed to load " + key);
808        }
809      }
810      newWaitable.provide(info);
811    } finally {
812      lock.unlock();
813    }
814    return info;
815  }
816
817  private void startCacheCleanerThreadIfNeeded() {
818    if (cacheCleaner == null) {
819      cacheCleaner = new CacheCleaner();
820      long rateMs = cacheCleaner.getRateInMs();
821      ScheduledFuture<?> future =
822          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
823              TimeUnit.MILLISECONDS);
824      cacheCleaner.setFuture(future);
825      if (LOG.isDebugEnabled()) {
826        LOG.debug(this + ": starting cache cleaner thread which will run " +
827          "every " + rateMs + " ms");
828      }
829    }
830  }
831
832  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
833      boolean anchored) {
834    Condition newCond;
835    lock.lock();
836    try {
837      while (replica.mmapData != null) {
838        if (replica.mmapData instanceof MappedByteBuffer) {
839          ref(replica);
840          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
841          return new ClientMmap(replica, mmap, anchored);
842        } else if (replica.mmapData instanceof Long) {
843          long lastAttemptTimeMs = (Long)replica.mmapData;
844          long delta = Time.monotonicNow() - lastAttemptTimeMs;
845          if (delta < mmapRetryTimeoutMs) {
846            if (LOG.isTraceEnabled()) {
847              LOG.trace(this + ": can't create client mmap for " +
848                  replica + " because we failed to " +
849                  "create one just " + delta + "ms ago.");
850            }
851            return null;
852          }
853          if (LOG.isTraceEnabled()) {
854            LOG.trace(this + ": retrying client mmap for " + replica +
855                ", " + delta + " ms after the previous failure.");
856          }
857        } else if (replica.mmapData instanceof Condition) {
858          Condition cond = (Condition)replica.mmapData;
859          cond.awaitUninterruptibly();
860        } else {
861          Preconditions.checkState(false, "invalid mmapData type %s",
862              replica.mmapData.getClass().getName());
863        }
864      }
865      newCond = lock.newCondition();
866      replica.mmapData = newCond;
867    } finally {
868      lock.unlock();
869    }
870    MappedByteBuffer map = replica.loadMmapInternal();
871    lock.lock();
872    try {
873      if (map == null) {
874        replica.mmapData = Long.valueOf(Time.monotonicNow());
875        newCond.signalAll();
876        return null;
877      } else {
878        outstandingMmapCount++;
879        replica.mmapData = map;
880        ref(replica);
881        newCond.signalAll();
882        return new ClientMmap(replica, map, anchored);
883      }
884    } finally {
885      lock.unlock();
886    }
887  }
888
889  /**
890   * Close the cache and free all associated resources.
891   */
892  @Override
893  public void close() {
894    try {
895      lock.lock();
896      if (closed) return;
897      closed = true;
898      LOG.info(this + ": closing");
899      maxNonMmappedEvictableLifespanMs = 0;
900      maxEvictableMmapedSize = 0;
901      // Close and join cacheCleaner thread.
902      IOUtils.cleanup(LOG, cacheCleaner);
903      // Purge all replicas.
904      while (true) {
905        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
906        if (entry == null) break;
907        purge(entry.getValue());
908      }
909      while (true) {
910        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
911        if (entry == null) break;
912        purge(entry.getValue());
913      }
914    } finally {
915      lock.unlock();
916    }
917    IOUtils.cleanup(LOG, shmManager);
918  }
919
920  @VisibleForTesting // ONLY for testing
921  public interface CacheVisitor {
922    void visit(int numOutstandingMmaps,
923        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
924        Map<ExtendedBlockId, InvalidToken> failedLoads,
925        Map<Long, ShortCircuitReplica> evictable,
926        Map<Long, ShortCircuitReplica> evictableMmapped);
927  }
928
929  @VisibleForTesting // ONLY for testing
930  public void accept(CacheVisitor visitor) {
931    lock.lock();
932    try {
933      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
934          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
935      Map<ExtendedBlockId, InvalidToken> failedLoads =
936          new HashMap<ExtendedBlockId, InvalidToken>();
937      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
938            replicaInfoMap.entrySet()) {
939        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
940        if (waitable.hasVal()) {
941          if (waitable.getVal().getReplica() != null) {
942            replicas.put(entry.getKey(), waitable.getVal().getReplica());
943          } else {
944            // The exception may be null here, indicating a failed load that
945            // isn't the result of an invalid block token.
946            failedLoads.put(entry.getKey(),
947                waitable.getVal().getInvalidTokenException());
948          }
949        }
950      }
951      if (LOG.isDebugEnabled()) {
952        StringBuilder builder = new StringBuilder();
953        builder.append("visiting ").append(visitor.getClass().getName()).
954            append("with outstandingMmapCount=").append(outstandingMmapCount).
955            append(", replicas=");
956        String prefix = "";
957        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
958          builder.append(prefix).append(entry.getValue());
959          prefix = ",";
960        }
961        prefix = "";
962        builder.append(", failedLoads=");
963        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
964          builder.append(prefix).append(entry.getValue());
965          prefix = ",";
966        }
967        prefix = "";
968        builder.append(", evictable=");
969        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
970          builder.append(prefix).append(entry.getKey()).
971              append(":").append(entry.getValue());
972          prefix = ",";
973        }
974        prefix = "";
975        builder.append(", evictableMmapped=");
976        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
977          builder.append(prefix).append(entry.getKey()).
978              append(":").append(entry.getValue());
979          prefix = ",";
980        }
981        LOG.debug(builder.toString());
982      }
983      visitor.visit(outstandingMmapCount, replicas, failedLoads,
984            evictable, evictableMmapped);
985    } finally {
986      lock.unlock();
987    }
988  }
989
990  @Override
991  public String toString() {
992    return "ShortCircuitCache(0x" +
993        Integer.toHexString(System.identityHashCode(this)) + ")";
994  }
995
996  /**
997   * Allocate a new shared memory slot.
998   *
999   * @param datanode       The datanode to allocate a shm slot with.
1000   * @param peer           A peer connected to the datanode.
1001   * @param usedPeer       Will be set to true if we use up the provided peer.
1002   * @param blockId        The block id and block pool id of the block we're 
1003   *                         allocating this slot for.
1004   * @param clientName     The name of the DFSClient allocating the shared
1005   *                         memory.
1006   * @return               Null if short-circuit shared memory is disabled;
1007   *                         a short-circuit memory slot otherwise.
1008   * @throws IOException   An exception if there was an error talking to 
1009   *                         the datanode.
1010   */
1011  public Slot allocShmSlot(DatanodeInfo datanode,
1012        DomainPeer peer, MutableBoolean usedPeer,
1013        ExtendedBlockId blockId, String clientName) throws IOException {
1014    if (shmManager != null) {
1015      return shmManager.allocSlot(datanode, peer, usedPeer,
1016          blockId, clientName);
1017    } else {
1018      return null;
1019    }
1020  }
1021
1022  /**
1023   * Free a slot immediately.
1024   *
1025   * ONLY use this if the DataNode is not yet aware of the slot.
1026   * 
1027   * @param slot           The slot to free.
1028   */
1029  public void freeSlot(Slot slot) {
1030    Preconditions.checkState(shmManager != null);
1031    slot.makeInvalid();
1032    shmManager.freeSlot(slot);
1033  }
1034  
1035  /**
1036   * Schedule a shared memory slot to be released.
1037   *
1038   * @param slot           The slot to release.
1039   */
1040  public void scheduleSlotReleaser(Slot slot) {
1041    Preconditions.checkState(shmManager != null);
1042    releaserExecutor.execute(new SlotReleaser(slot));
1043  }
1044
1045  @VisibleForTesting
1046  public DfsClientShmManager getDfsClientShmManager() {
1047    return shmManager;
1048  }
1049}