/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.FileInputStream;
021import java.io.IOException;
022import java.lang.reflect.Field;
023import java.util.BitSet;
024import java.util.Iterator;
025import java.util.NoSuchElementException;
026import java.util.Random;
027
028import org.apache.commons.lang.builder.EqualsBuilder;
029import org.apache.commons.lang.builder.HashCodeBuilder;
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.fs.InvalidRequestException;
033import org.apache.hadoop.hdfs.ExtendedBlockId;
034import org.apache.hadoop.io.nativeio.NativeIO;
035import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
036import org.apache.hadoop.util.Shell;
037import org.apache.hadoop.util.StringUtils;
038
039import sun.misc.Unsafe;
040
041import com.google.common.base.Preconditions;
042import com.google.common.collect.ComparisonChain;
043import com.google.common.primitives.Ints;
044
045/**
046 * A shared memory segment used to implement short-circuit reads.
047 */
048public class ShortCircuitShm {
049  private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
050
051  protected static final int BYTES_PER_SLOT = 64;
052
053  private static final Unsafe unsafe = safetyDance();
054
055  private static Unsafe safetyDance() {
056    try {
057      Field f = Unsafe.class.getDeclaredField("theUnsafe");
058      f.setAccessible(true);
059      return (Unsafe)f.get(null);
060    } catch (Throwable e) {
061      LOG.error("failed to load misc.Unsafe", e);
062    }
063    return null;
064  }
065
066  /**
067   * Calculate the usable size of a shared memory segment.
068   * We round down to a multiple of the slot size and do some validation.
069   *
070   * @param stream The stream we're using.
071   * @return       The usable size of the shared memory segment.
072   */
073  private static int getUsableLength(FileInputStream stream)
074      throws IOException {
075    int intSize = Ints.checkedCast(stream.getChannel().size());
076    int slots = intSize / BYTES_PER_SLOT;
077    if (slots == 0) {
078      throw new IOException("size of shared memory segment was " +
079          intSize + ", but that is not enough to hold even one slot.");
080    }
081    return slots * BYTES_PER_SLOT;
082  }
083
084  /**
085   * Identifies a DfsClientShm.
086   */
087  public static class ShmId implements Comparable<ShmId> {
088    private static final Random random = new Random();
089    private final long hi;
090    private final long lo;
091
092    /**
093     * Generate a random ShmId.
094     * 
095     * We generate ShmIds randomly to prevent a malicious client from
096     * successfully guessing one and using that to interfere with another
097     * client.
098     */
099    public static ShmId createRandom() {
100      return new ShmId(random.nextLong(), random.nextLong());
101    }
102
103    public ShmId(long hi, long lo) {
104      this.hi = hi;
105      this.lo = lo;
106    }
107    
108    public long getHi() {
109      return hi;
110    }
111    
112    public long getLo() {
113      return lo;
114    }
115
116    @Override
117    public boolean equals(Object o) {
118      if ((o == null) || (o.getClass() != this.getClass())) {
119        return false;
120      }
121      ShmId other = (ShmId)o;
122      return new EqualsBuilder().
123          append(hi, other.hi).
124          append(lo, other.lo).
125          isEquals();
126    }
127
128    @Override
129    public int hashCode() {
130      return new HashCodeBuilder().
131          append(this.hi).
132          append(this.lo).
133          toHashCode();
134    }
135
136    @Override
137    public String toString() {
138      return String.format("%016x%016x", hi, lo);
139    }
140
141    @Override
142    public int compareTo(ShmId other) {
143      return ComparisonChain.start().
144          compare(hi, other.hi).
145          compare(lo, other.lo).
146          result();
147    }
148  };
149
150  /**
151   * Uniquely identifies a slot.
152   */
153  public static class SlotId {
154    private final ShmId shmId;
155    private final int slotIdx;
156    
157    public SlotId(ShmId shmId, int slotIdx) {
158      this.shmId = shmId;
159      this.slotIdx = slotIdx;
160    }
161
162    public ShmId getShmId() {
163      return shmId;
164    }
165
166    public int getSlotIdx() {
167      return slotIdx;
168    }
169
170    @Override
171    public boolean equals(Object o) {
172      if ((o == null) || (o.getClass() != this.getClass())) {
173        return false;
174      }
175      SlotId other = (SlotId)o;
176      return new EqualsBuilder().
177          append(shmId, other.shmId).
178          append(slotIdx, other.slotIdx).
179          isEquals();
180    }
181
182    @Override
183    public int hashCode() {
184      return new HashCodeBuilder().
185          append(this.shmId).
186          append(this.slotIdx).
187          toHashCode();
188    }
189
190    @Override
191    public String toString() {
192      return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
193    }
194  }
195
196  public class SlotIterator implements Iterator<Slot> {
197    int slotIdx = -1;
198
199    @Override
200    public boolean hasNext() {
201      synchronized (ShortCircuitShm.this) {
202        return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
203      }
204    }
205
206    @Override
207    public Slot next() {
208      synchronized (ShortCircuitShm.this) {
209        int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
210        if (nextSlotIdx == -1) {
211          throw new NoSuchElementException();
212        }
213        slotIdx = nextSlotIdx;
214        return slots[nextSlotIdx];
215      }
216    }
217
218    @Override
219    public void remove() {
220      throw new UnsupportedOperationException("SlotIterator " +
221          "doesn't support removal");
222    }
223  }
224  
225  /**
226   * A slot containing information about a replica.
227   *
228   * The format is:
229   * word 0
230   *   bit 0:32   Slot flags (see below).
231   *   bit 33:63  Anchor count.
232   * word 1:7
233   *   Reserved for future use, such as statistics.
234   *   Padding is also useful for avoiding false sharing.
235   *
236   * Little-endian versus big-endian is not relevant here since both the client
237   * and the server reside on the same computer and use the same orientation.
238   */
239  public class Slot {
240    /**
241     * Flag indicating that the slot is valid.  
242     * 
243     * The DFSClient sets this flag when it allocates a new slot within one of
244     * its shared memory regions.
245     * 
246     * The DataNode clears this flag when the replica associated with this slot
247     * is no longer valid.  The client itself also clears this flag when it
248     * believes that the DataNode is no longer using this slot to communicate.
249     */
250    private static final long VALID_FLAG =          1L<<63;
251
252    /**
253     * Flag indicating that the slot can be anchored.
254     */
255    private static final long ANCHORABLE_FLAG =     1L<<62;
256
257    /**
258     * The slot address in memory.
259     */
260    private final long slotAddress;
261
262    /**
263     * BlockId of the block this slot is used for.
264     */
265    private final ExtendedBlockId blockId;
266
267    Slot(long slotAddress, ExtendedBlockId blockId) {
268      this.slotAddress = slotAddress;
269      this.blockId = blockId;
270    }
271
272    /**
273     * Get the short-circuit memory segment associated with this Slot.
274     *
275     * @return      The enclosing short-circuit memory segment.
276     */
277    public ShortCircuitShm getShm() {
278      return ShortCircuitShm.this;
279    }
280
281    /**
282     * Get the ExtendedBlockId associated with this slot.
283     *
284     * @return      The ExtendedBlockId of this slot.
285     */
286    public ExtendedBlockId getBlockId() {
287      return blockId;
288    }
289
290    /**
291     * Get the SlotId of this slot, containing both shmId and slotIdx.
292     *
293     * @return      The SlotId of this slot.
294     */
295    public SlotId getSlotId() {
296      return new SlotId(getShmId(), getSlotIdx());
297    }
298
299    /**
300     * Get the Slot index.
301     *
302     * @return      The index of this slot.
303     */
304    public int getSlotIdx() {
305      return Ints.checkedCast(
306          (slotAddress - baseAddress) / BYTES_PER_SLOT);
307    }
308
309    /**
310     * Clear the slot.
311     */
312    void clear() {
313      unsafe.putLongVolatile(null, this.slotAddress, 0);
314    }
315
316    private boolean isSet(long flag) {
317      long prev = unsafe.getLongVolatile(null, this.slotAddress);
318      return (prev & flag) != 0;
319    }
320
321    private void setFlag(long flag) {
322      long prev;
323      do {
324        prev = unsafe.getLongVolatile(null, this.slotAddress);
325        if ((prev & flag) != 0) {
326          return;
327        }
328      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
329                  prev, prev | flag));
330    }
331
332    private void clearFlag(long flag) {
333      long prev;
334      do {
335        prev = unsafe.getLongVolatile(null, this.slotAddress);
336        if ((prev & flag) == 0) {
337          return;
338        }
339      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
340                  prev, prev & (~flag)));
341    }
342    
343    public boolean isValid() {
344      return isSet(VALID_FLAG);
345    }
346
347    public void makeValid() {
348      setFlag(VALID_FLAG);
349    }
350
351    public void makeInvalid() {
352      clearFlag(VALID_FLAG);
353    }
354
355    public boolean isAnchorable() {
356      return isSet(ANCHORABLE_FLAG);
357    }
358
359    public void makeAnchorable() {
360      setFlag(ANCHORABLE_FLAG);
361    }
362
363    public void makeUnanchorable() {
364      clearFlag(ANCHORABLE_FLAG);
365    }
366
367    public boolean isAnchored() {
368      long prev = unsafe.getLongVolatile(null, this.slotAddress);
369      if ((prev & VALID_FLAG) == 0) {
370        // Slot is no longer valid.
371        return false;
372      }
373      return ((prev & 0x7fffffff) != 0);
374    }
375
376    /**
377     * Try to add an anchor for a given slot.
378     *
379     * When a slot is anchored, we know that the block it refers to is resident
380     * in memory.
381     *
382     * @return          True if the slot is anchored.
383     */
384    public boolean addAnchor() {
385      long prev;
386      do {
387        prev = unsafe.getLongVolatile(null, this.slotAddress);
388        if ((prev & VALID_FLAG) == 0) {
389          // Slot is no longer valid.
390          return false;
391        }
392        if ((prev & ANCHORABLE_FLAG) == 0) {
393          // Slot can't be anchored right now.
394          return false;
395        }
396        if ((prev & 0x7fffffff) == 0x7fffffff) {
397          // Too many other threads have anchored the slot (2 billion?)
398          return false;
399        }
400      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
401                  prev, prev + 1));
402      return true;
403    }
404
405    /**
406     * Remove an anchor for a given slot.
407     */
408    public void removeAnchor() {
409      long prev;
410      do {
411        prev = unsafe.getLongVolatile(null, this.slotAddress);
412        Preconditions.checkState((prev & 0x7fffffff) != 0,
413            "Tried to remove anchor for slot " + slotAddress +", which was " +
414            "not anchored.");
415      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
416                  prev, prev - 1));
417    }
418
419    @Override
420    public String toString() {
421      return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
422    }
423  }
424
425  /**
426   * ID for this SharedMemorySegment.
427   */
428  private final ShmId shmId;
429
430  /**
431   * The base address of the memory-mapped file.
432   */
433  private final long baseAddress;
434
435  /**
436   * The mmapped length of the shared memory segment
437   */
438  private final int mmappedLength;
439
440  /**
441   * The slots associated with this shared memory segment.
442   * slot[i] contains the slot at offset i * BYTES_PER_SLOT,
443   * or null if that slot is not allocated.
444   */
445  private final Slot slots[];
446
447  /**
448   * A bitset where each bit represents a slot which is in use.
449   */
450  private final BitSet allocatedSlots;
451
452  /**
453   * Create the ShortCircuitShm.
454   * 
455   * @param shmId       The ID to use.
456   * @param stream      The stream that we're going to use to create this 
457   *                    shared memory segment.
458   *                    
459   *                    Although this is a FileInputStream, we are going to
460   *                    assume that the underlying file descriptor is writable
461   *                    as well as readable. It would be more appropriate to use
462   *                    a RandomAccessFile here, but that class does not have
463   *                    any public accessor which returns a FileDescriptor,
464   *                    unlike FileInputStream.
465   */
466  public ShortCircuitShm(ShmId shmId, FileInputStream stream)
467        throws IOException {
468    if (!NativeIO.isAvailable()) {
469      throw new UnsupportedOperationException("NativeIO is not available.");
470    }
471    if (Shell.WINDOWS) {
472      throw new UnsupportedOperationException(
473          "DfsClientShm is not yet implemented for Windows.");
474    }
475    if (unsafe == null) {
476      throw new UnsupportedOperationException(
477          "can't use DfsClientShm because we failed to " +
478          "load misc.Unsafe.");
479    }
480    this.shmId = shmId;
481    this.mmappedLength = getUsableLength(stream);
482    this.baseAddress = POSIX.mmap(stream.getFD(), 
483        POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
484    this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
485    this.allocatedSlots = new BitSet(slots.length);
486    if (LOG.isTraceEnabled()) {
487      LOG.trace("creating " + this.getClass().getSimpleName() +
488          "(shmId=" + shmId +
489          ", mmappedLength=" + mmappedLength +
490          ", baseAddress=" + String.format("%x", baseAddress) +
491          ", slots.length=" + slots.length + ")");
492    }
493  }
494
495  public final ShmId getShmId() {
496    return shmId;
497  }
498  
499  /**
500   * Determine if this shared memory object is empty.
501   *
502   * @return    True if the shared memory object is empty.
503   */
504  synchronized final public boolean isEmpty() {
505    return allocatedSlots.nextSetBit(0) == -1;
506  }
507
508  /**
509   * Determine if this shared memory object is full.
510   *
511   * @return    True if the shared memory object is full.
512   */
513  synchronized final public boolean isFull() {
514    return allocatedSlots.nextClearBit(0) >= slots.length;
515  }
516
517  /**
518   * Calculate the base address of a slot.
519   *
520   * @param slotIdx   Index of the slot.
521   * @return          The base address of the slot.
522   */
523  private final long calculateSlotAddress(int slotIdx) {
524    long offset = slotIdx;
525    offset *= BYTES_PER_SLOT;
526    return this.baseAddress + offset;
527  }
528
529  /**
530   * Allocate a new slot and register it.
531   *
532   * This function chooses an empty slot, initializes it, and then returns
533   * the relevant Slot object.
534   *
535   * @return    The new slot.
536   */
537  synchronized public final Slot allocAndRegisterSlot(
538      ExtendedBlockId blockId) {
539    int idx = allocatedSlots.nextClearBit(0);
540    if (idx >= slots.length) {
541      throw new RuntimeException(this + ": no more slots are available.");
542    }
543    allocatedSlots.set(idx, true);
544    Slot slot = new Slot(calculateSlotAddress(idx), blockId);
545    slot.clear();
546    slot.makeValid();
547    slots[idx] = slot;
548    if (LOG.isTraceEnabled()) {
549      LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
550                  StringUtils.getStackTrace(Thread.currentThread()));
551    }
552    return slot;
553  }
554
555  synchronized public final Slot getSlot(int slotIdx)
556      throws InvalidRequestException {
557    if (!allocatedSlots.get(slotIdx)) {
558      throw new InvalidRequestException(this + ": slot " + slotIdx +
559          " does not exist.");
560    }
561    return slots[slotIdx];
562  }
563
564  /**
565   * Register a slot.
566   *
567   * This function looks at a slot which has already been initialized (by
568   * another process), and registers it with us.  Then, it returns the 
569   * relevant Slot object.
570   *
571   * @return    The slot.
572   *
573   * @throws InvalidRequestException
574   *            If the slot index we're trying to allocate has not been
575   *            initialized, or is already in use.
576   */
577  synchronized public final Slot registerSlot(int slotIdx,
578      ExtendedBlockId blockId) throws InvalidRequestException {
579    if (slotIdx < 0) {
580      throw new InvalidRequestException(this + ": invalid negative slot " +
581          "index " + slotIdx);
582    }
583    if (slotIdx >= slots.length) {
584      throw new InvalidRequestException(this + ": invalid slot " +
585          "index " + slotIdx);
586    }
587    if (allocatedSlots.get(slotIdx)) {
588      throw new InvalidRequestException(this + ": slot " + slotIdx +
589          " is already in use.");
590    }
591    Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
592    if (!slot.isValid()) {
593      throw new InvalidRequestException(this + ": slot " + slotIdx +
594          " is not marked as valid.");
595    }
596    slots[slotIdx] = slot;
597    allocatedSlots.set(slotIdx, true);
598    if (LOG.isTraceEnabled()) {
599      LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
600                  StringUtils.getStackTrace(Thread.currentThread()));
601    }
602    return slot;
603  }
604
605  /**
606   * Unregisters a slot.
607   * 
608   * This doesn't alter the contents of the slot.  It just means
609   *
610   * @param slotIdx  Index of the slot to unregister.
611   */
612  synchronized public final void unregisterSlot(int slotIdx) {
613    Preconditions.checkState(allocatedSlots.get(slotIdx),
614        "tried to unregister slot " + slotIdx + ", which was not registered.");
615    allocatedSlots.set(slotIdx, false);
616    slots[slotIdx] = null;
617    if (LOG.isTraceEnabled()) {
618      LOG.trace(this + ": unregisterSlot " + slotIdx);
619    }
620  }
621  
622  /**
623   * Iterate over all allocated slots.
624   * 
625   * Note that this method isn't safe if 
626   *
627   * @return        The slot iterator.
628   */
629  public SlotIterator slotIterator() {
630    return new SlotIterator();
631  }
632
633  public void free() {
634    try {
635      POSIX.munmap(baseAddress, mmappedLength);
636    } catch (IOException e) {
637      LOG.warn(this + ": failed to munmap", e);
638    }
639    LOG.trace(this + ": freed");
640  }
641  
642  @Override
643  public String toString() {
644    return this.getClass().getSimpleName() + "(" + shmId + ")";
645  }
646}