001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025import java.io.Closeable;
026import java.io.FileInputStream;
027import java.io.IOException;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.Iterator;
031import java.util.Set;
032
033import com.google.common.annotations.VisibleForTesting;
034import org.apache.commons.io.IOUtils;
035import org.apache.commons.logging.Log;
036import org.apache.commons.logging.LogFactory;
037import org.apache.hadoop.conf.Configuration;
038import org.apache.hadoop.fs.InvalidRequestException;
039import org.apache.hadoop.hdfs.ExtendedBlockId;
040import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm;
041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
043import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
045import org.apache.hadoop.net.unix.DomainSocket;
046import org.apache.hadoop.net.unix.DomainSocketWatcher;
047
048import com.google.common.base.Joiner;
049import com.google.common.base.Preconditions;
050import com.google.common.collect.HashMultimap;
051
052/**
053 * Manages client short-circuit memory segments on the DataNode.
054 *
055 * DFSClients request shared memory segments from the DataNode.  The 
056 * ShortCircuitRegistry generates and manages these segments.  Each segment
057 * has a randomly generated 128-bit ID which uniquely identifies it.  The
058 * segments each contain several "slots."
059 *
060 * Before performing a short-circuit read, DFSClients must request a pair of
061 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
062 * operation.  As part of this operation, DFSClients pass the ID of the shared
063 * memory segment they would like to use to communicate information about this
064 * replica, as well as the slot number within that segment they would like to
065 * use.  Slot allocation is always done by the client.
066 *
067 * Slots are used to track the state of the block on the both the client and
068 * datanode. When this DataNode mlocks a block, the corresponding slots for the
069 * replicas are marked as "anchorable".  Anchorable blocks can be safely read
070 * without verifying the checksum.  This means that BlockReaderLocal objects
071 * using these replicas can skip checksumming.  It also means that we can do
072 * zero-copy reads on these replicas (the ZCR interface has no way of
073 * verifying checksums.)
074 * 
075 * When a DN needs to munlock a block, it needs to first wait for the block to
076 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
077 * DN also marks the block's slots as "unanchorable" to prevent additional 
078 * clients from initiating these operations in the future.
079 * 
080 * The counterpart of this class on the client is {@link DfsClientShmManager}.
081 */
082public class ShortCircuitRegistry {
083  public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
084
085  private static final int SHM_LENGTH = 8192;
086
087  public static class RegisteredShm extends ShortCircuitShm
088      implements DomainSocketWatcher.Handler {
089    private final String clientName;
090    private final ShortCircuitRegistry registry;
091
092    RegisteredShm(String clientName, ShmId shmId, FileInputStream stream,
093        ShortCircuitRegistry registry) throws IOException {
094      super(shmId, stream);
095      this.clientName = clientName;
096      this.registry = registry;
097    }
098
099    @Override
100    public boolean handle(DomainSocket sock) {
101      synchronized (registry) {
102        synchronized (this) {
103          registry.removeShm(this);
104        }
105      }
106      return true;
107    }
108
109    String getClientName() {
110      return clientName;
111    }
112  }
113
114  public synchronized void removeShm(ShortCircuitShm shm) {
115    if (LOG.isTraceEnabled()) {
116      LOG.debug("removing shm " + shm);
117    }
118    // Stop tracking the shmId.
119    RegisteredShm removedShm = segments.remove(shm.getShmId());
120    Preconditions.checkState(removedShm == shm,
121        "failed to remove " + shm.getShmId());
122    // Stop tracking the slots.
123    for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
124      Slot slot = iter.next();
125      boolean removed = slots.remove(slot.getBlockId(), slot);
126      Preconditions.checkState(removed);
127      slot.makeInvalid();
128    }
129    // De-allocate the memory map and close the shared file. 
130    shm.free();
131  }
132
133  /**
134   * Whether or not the registry is enabled.
135   */
136  private boolean enabled;
137
138  /**
139   * The factory which creates shared file descriptors.
140   */
141  private final SharedFileDescriptorFactory shmFactory;
142  
143  /**
144   * A watcher which sends out callbacks when the UNIX domain socket
145   * associated with a shared memory segment closes.
146   */
147  private final DomainSocketWatcher watcher;
148
149  private final HashMap<ShmId, RegisteredShm> segments =
150      new HashMap<ShmId, RegisteredShm>(0);
151  
152  private final HashMultimap<ExtendedBlockId, Slot> slots =
153      HashMultimap.create(0, 1);
154  
155  public ShortCircuitRegistry(Configuration conf) throws IOException {
156    boolean enabled = false;
157    SharedFileDescriptorFactory shmFactory = null;
158    DomainSocketWatcher watcher = null;
159    try {
160      int interruptCheck = conf.getInt(
161          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
162          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
163      if (interruptCheck <= 0) {
164        throw new IOException(
165            DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
166            " was set to " + interruptCheck);
167      }
168      String shmPaths[] =
169          conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
170      if (shmPaths.length == 0) {
171        shmPaths =
172            DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
173      }
174      shmFactory = SharedFileDescriptorFactory.
175          create("HadoopShortCircuitShm_", shmPaths);
176      String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
177      if (dswLoadingFailure != null) {
178        throw new IOException(dswLoadingFailure);
179      }
180      watcher = new DomainSocketWatcher(interruptCheck, "datanode");
181      enabled = true;
182      if (LOG.isDebugEnabled()) {
183        LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
184                  interruptCheck + ", shmPath=" + shmFactory.getPath());
185      }
186    } catch (IOException e) {
187      if (LOG.isDebugEnabled()) {
188        LOG.debug("Disabling ShortCircuitRegistry", e);
189      }
190    } finally {
191      this.enabled = enabled;
192      this.shmFactory = shmFactory;
193      this.watcher = watcher;
194    }
195  }
196
197  /**
198   * Process a block mlock event from the FsDatasetCache.
199   *
200   * @param blockId    The block that was mlocked.
201   */
202  public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
203    if (!enabled) return;
204    Set<Slot> affectedSlots = slots.get(blockId);
205    for (Slot slot : affectedSlots) {
206      slot.makeAnchorable();
207    }
208  }
209
210  /**
211   * Mark any slots associated with this blockId as unanchorable.
212   *
213   * @param blockId        The block ID.
214   * @return               True if we should allow the munlock request.
215   */
216  public synchronized boolean processBlockMunlockRequest(
217      ExtendedBlockId blockId) {
218    if (!enabled) return true;
219    boolean allowMunlock = true;
220    Set<Slot> affectedSlots = slots.get(blockId);
221    for (Slot slot : affectedSlots) {
222      slot.makeUnanchorable();
223      if (slot.isAnchored()) {
224        allowMunlock = false;
225      }
226    }
227    return allowMunlock;
228  }
229
230  /**
231   * Invalidate any slot associated with a blockId that we are invalidating
232   * (deleting) from this DataNode.  When a slot is invalid, the DFSClient will
233   * not use the corresponding replica for new read or mmap operations (although
234   * existing, ongoing read or mmap operations will complete.)
235   *
236   * @param blockId        The block ID.
237   */
238  public synchronized void processBlockInvalidation(ExtendedBlockId blockId) {
239    if (!enabled) return;
240    final Set<Slot> affectedSlots = slots.get(blockId);
241    if (!affectedSlots.isEmpty()) {
242      final StringBuilder bld = new StringBuilder();
243      String prefix = "";
244      bld.append("Block ").append(blockId).append(" has been invalidated.  ").
245          append("Marking short-circuit slots as invalid: ");
246      for (Slot slot : affectedSlots) {
247        slot.makeInvalid();
248        bld.append(prefix).append(slot.toString());
249        prefix = ", ";
250      }
251      LOG.info(bld.toString());
252    }
253  }
254
255  public synchronized String getClientNames(ExtendedBlockId blockId) {
256    if (!enabled) return "";
257    final HashSet<String> clientNames = new HashSet<String>();
258    final Set<Slot> affectedSlots = slots.get(blockId);
259    for (Slot slot : affectedSlots) {
260      clientNames.add(((RegisteredShm)slot.getShm()).getClientName());
261    }
262    return Joiner.on(",").join(clientNames);
263  }
264
265  public static class NewShmInfo implements Closeable {
266    public final ShmId shmId;
267    public final FileInputStream stream;
268
269    NewShmInfo(ShmId shmId, FileInputStream stream) {
270      this.shmId = shmId;
271      this.stream = stream;
272    }
273
274    @Override
275    public void close() throws IOException {
276      stream.close();
277    }
278  }
279
280  /**
281   * Handle a DFSClient request to create a new memory segment.
282   *
283   * @param clientName    Client name as reported by the client.
284   * @param sock          The DomainSocket to associate with this memory
285   *                        segment.  When this socket is closed, or the
286   *                        other side writes anything to the socket, the
287   *                        segment will be closed.  This can happen at any
288   *                        time, including right after this function returns.
289   * @return              A NewShmInfo object.  The caller must close the
290   *                        NewShmInfo object once they are done with it.
291   * @throws IOException  If the new memory segment could not be created.
292   */
293  public NewShmInfo createNewMemorySegment(String clientName,
294      DomainSocket sock) throws IOException {
295    NewShmInfo info = null;
296    RegisteredShm shm = null;
297    ShmId shmId = null;
298    synchronized (this) {
299      if (!enabled) {
300        if (LOG.isTraceEnabled()) {
301          LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
302              "not enabled.");
303        }
304        throw new UnsupportedOperationException();
305      }
306      FileInputStream fis = null;
307      try {
308        do {
309          shmId = ShmId.createRandom();
310        } while (segments.containsKey(shmId));
311        fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
312        shm = new RegisteredShm(clientName, shmId, fis, this);
313      } finally {
314        if (shm == null) {
315          IOUtils.closeQuietly(fis);
316        }
317      }
318      info = new NewShmInfo(shmId, fis);
319      segments.put(shmId, shm);
320    }
321    // Drop the registry lock to prevent deadlock.
322    // After this point, RegisteredShm#handle may be called at any time.
323    watcher.add(sock, shm);
324    if (LOG.isTraceEnabled()) {
325      LOG.trace("createNewMemorySegment: created " + info.shmId);
326    }
327    return info;
328  }
329  
330  public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
331      boolean isCached) throws InvalidRequestException {
332    if (!enabled) {
333      if (LOG.isTraceEnabled()) {
334        LOG.trace(this + " can't register a slot because the " +
335            "ShortCircuitRegistry is not enabled.");
336      }
337      throw new UnsupportedOperationException();
338    }
339    ShmId shmId = slotId.getShmId();
340    RegisteredShm shm = segments.get(shmId);
341    if (shm == null) {
342      throw new InvalidRequestException("there is no shared memory segment " +
343          "registered with shmId " + shmId);
344    }
345    Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
346    if (isCached) {
347      slot.makeAnchorable();
348    } else {
349      slot.makeUnanchorable();
350    }
351    boolean added = slots.put(blockId, slot);
352    Preconditions.checkState(added);
353    if (LOG.isTraceEnabled()) {
354      LOG.trace(this + ": registered " + blockId + " with slot " +
355        slotId + " (isCached=" + isCached + ")");
356    }
357  }
358  
359  public synchronized void unregisterSlot(SlotId slotId)
360      throws InvalidRequestException {
361    if (!enabled) {
362      if (LOG.isTraceEnabled()) {
363        LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
364            "not enabled.");
365      }
366      throw new UnsupportedOperationException();
367    }
368    ShmId shmId = slotId.getShmId();
369    RegisteredShm shm = segments.get(shmId);
370    if (shm == null) {
371      throw new InvalidRequestException("there is no shared memory segment " +
372          "registered with shmId " + shmId);
373    }
374    Slot slot = shm.getSlot(slotId.getSlotIdx());
375    slot.makeInvalid();
376    shm.unregisterSlot(slotId.getSlotIdx());
377    slots.remove(slot.getBlockId(), slot);
378  }
379  
380  public void shutdown() {
381    synchronized (this) {
382      if (!enabled) return;
383      enabled = false;
384    }
385    IOUtils.closeQuietly(watcher);
386  }
387
388  public static interface Visitor {
389    void accept(HashMap<ShmId, RegisteredShm> segments,
390                HashMultimap<ExtendedBlockId, Slot> slots);
391  }
392
393  @VisibleForTesting
394  public synchronized void visit(Visitor visitor) {
395    visitor.accept(segments, slots);
396  }
397}