001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataOutputStream;
023import java.io.EOFException;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.util.HashMap;
027import java.util.Map.Entry;
028import java.util.TreeMap;
029import java.util.concurrent.locks.Condition;
030import java.util.concurrent.locks.ReentrantLock;
031
032import org.apache.commons.lang.mutable.MutableBoolean;
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.classification.InterfaceAudience;
036import org.apache.hadoop.hdfs.ExtendedBlockId;
037import org.apache.hadoop.hdfs.net.DomainPeer;
038import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
039import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
040import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
041import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
042import org.apache.hadoop.hdfs.protocolPB.PBHelper;
043import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
044import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
045import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
046import org.apache.hadoop.io.IOUtils;
047import org.apache.hadoop.net.unix.DomainSocket;
048import org.apache.hadoop.net.unix.DomainSocketWatcher;
049
050import com.google.common.annotations.VisibleForTesting;
051import com.google.common.base.Preconditions;
052
053/**
054 * Manages short-circuit memory segments for an HDFS client.
055 * 
 * Clients are responsible for requesting and releasing shared memory segments used
 * for communicating with the DataNode. The client will try to allocate new slots
 * in the set of existing segments, falling back to requesting a new segment from
 * the DataNode via {@link DataTransferProtocol#requestShortCircuitShm}.
060 * 
061 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
062 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
063 */
064@InterfaceAudience.Private
065public class DfsClientShmManager implements Closeable {
066  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
067
068  /**
069   * Manages short-circuit memory segments that pertain to a given DataNode.
070   */
071  class EndpointShmManager {
072    /**
073     * The datanode we're managing.
074     */
075    private final DatanodeInfo datanode;
076
077    /**
078     * Shared memory segments which have no empty slots.
079     *
080     * Protected by the manager lock.
081     */
082    private final TreeMap<ShmId, DfsClientShm> full =
083        new TreeMap<ShmId, DfsClientShm>();
084
085    /**
086     * Shared memory segments which have at least one empty slot.
087     *
088     * Protected by the manager lock.
089     */
090    private final TreeMap<ShmId, DfsClientShm> notFull =
091        new TreeMap<ShmId, DfsClientShm>();
092
093    /**
094     * True if this datanode doesn't support short-circuit shared memory
095     * segments.
096     *
097     * Protected by the manager lock.
098     */
099    private boolean disabled = false;
100
101    /**
102     * True if we're in the process of loading a shared memory segment from
103     * this DataNode.
104     *
105     * Protected by the manager lock.
106     */
107    private boolean loading = false;
108
    /**
     * Create an EndpointShmManager for a single DataNode.
     *
     * @param datanode  The datanode whose shared memory segments this
     *                    manager will track.
     */
    EndpointShmManager (DatanodeInfo datanode) {
      this.datanode = datanode;
    }
112
113    /**
114     * Pull a slot out of a preexisting shared memory segment.
115     *
116     * Must be called with the manager lock held.
117     *
118     * @param blockId     The blockId to put inside the Slot object.
119     *
120     * @return            null if none of our shared memory segments contain a
121     *                      free slot; the slot object otherwise.
122     */
123    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
124      if (notFull.isEmpty()) {
125        return null;
126      }
127      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
128      DfsClientShm shm = entry.getValue();
129      ShmId shmId = shm.getShmId();
130      Slot slot = shm.allocAndRegisterSlot(blockId);
131      if (shm.isFull()) {
132        if (LOG.isTraceEnabled()) {
133          LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
134              " out of " + shm);
135        }
136        DfsClientShm removedShm = notFull.remove(shmId);
137        Preconditions.checkState(removedShm == shm);
138        full.put(shmId, shm);
139      } else {
140        if (LOG.isTraceEnabled()) {
141          LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
142              " out of " + shm);
143        }
144      }
145      return slot;
146    }
147
148    /**
149     * Ask the DataNode for a new shared memory segment.  This function must be
150     * called with the manager lock held.  We will release the lock while
151     * communicating with the DataNode.
152     *
153     * @param clientName    The current client name.
154     * @param peer          The peer to use to talk to the DataNode.
155     *
156     * @return              Null if the DataNode does not support shared memory
157     *                        segments, or experienced an error creating the
158     *                        shm.  The shared memory segment itself on success.
159     * @throws IOException  If there was an error communicating over the socket.
160     *                        We will not throw an IOException unless the socket
161     *                        itself (or the network) is the problem.
162     */
163    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
164        throws IOException {
165      final DataOutputStream out = 
166          new DataOutputStream(
167              new BufferedOutputStream(peer.getOutputStream()));
168      new Sender(out).requestShortCircuitShm(clientName);
169      ShortCircuitShmResponseProto resp = 
170          ShortCircuitShmResponseProto.parseFrom(
171              PBHelper.vintPrefixed(peer.getInputStream()));
172      String error = resp.hasError() ? resp.getError() : "(unknown)";
173      switch (resp.getStatus()) {
174      case SUCCESS:
175        DomainSocket sock = peer.getDomainSocket();
176        byte buf[] = new byte[1];
177        FileInputStream fis[] = new FileInputStream[1];
178        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
179          throw new EOFException("got EOF while trying to transfer the " +
180              "file descriptor for the shared memory segment.");
181        }
182        if (fis[0] == null) {
183          throw new IOException("the datanode " + datanode + " failed to " +
184              "pass a file descriptor for the shared memory segment.");
185        }
186        try {
187          DfsClientShm shm = 
188              new DfsClientShm(PBHelper.convert(resp.getId()),
189                  fis[0], this, peer);
190          if (LOG.isTraceEnabled()) {
191            LOG.trace(this + ": createNewShm: created " + shm);
192          }
193          return shm;
194        } finally {
195          IOUtils.cleanup(LOG,  fis[0]);
196        }
197      case ERROR_UNSUPPORTED:
198        // The DataNode just does not support short-circuit shared memory
199        // access, and we should stop asking.
200        LOG.info(this + ": datanode does not support short-circuit " +
201            "shared memory access: " + error);
202        disabled = true;
203        return null;
204      default:
205        // The datanode experienced some kind of unexpected error when trying to
206        // create the short-circuit shared memory segment.
207        LOG.warn(this + ": error requesting short-circuit shared memory " +
208            "access: " + error);
209        return null;
210      }
211    }
212
213    /**
214     * Allocate a new shared memory slot connected to this datanode.
215     *
216     * Must be called with the EndpointShmManager lock held.
217     *
218     * @param peer          The peer to use to talk to the DataNode.
219     * @param usedPeer      (out param) Will be set to true if we used the peer.
220     *                        When a peer is used
221     *
222     * @param clientName    The client name.
223     * @param blockId       The block ID to use.
224     * @return              null if the DataNode does not support shared memory
225     *                        segments, or experienced an error creating the
226     *                        shm.  The shared memory segment itself on success.
227     * @throws IOException  If there was an error communicating over the socket.
228     */
229    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
230        String clientName, ExtendedBlockId blockId) throws IOException {
231      while (true) {
232        if (closed) {
233          if (LOG.isTraceEnabled()) {
234            LOG.trace(this + ": the DfsClientShmManager has been closed.");
235          }
236          return null;
237        }
238        if (disabled) {
239          if (LOG.isTraceEnabled()) {
240            LOG.trace(this + ": shared memory segment access is disabled.");
241          }
242          return null;
243        }
244        // Try to use an existing slot.
245        Slot slot = allocSlotFromExistingShm(blockId);
246        if (slot != null) {
247          return slot;
248        }
249        // There are no free slots.  If someone is loading more slots, wait
250        // for that to finish.
251        if (loading) {
252          if (LOG.isTraceEnabled()) {
253            LOG.trace(this + ": waiting for loading to finish...");
254          }
255          finishedLoading.awaitUninterruptibly();
256        } else {
257          // Otherwise, load the slot ourselves.
258          loading = true;
259          lock.unlock();
260          DfsClientShm shm;
261          try {
262            shm = requestNewShm(clientName, peer);
263            if (shm == null) continue;
264            // See #{DfsClientShmManager#domainSocketWatcher} for details
265            // about why we do this before retaking the manager lock.
266            domainSocketWatcher.add(peer.getDomainSocket(), shm);
267            // The DomainPeer is now our responsibility, and should not be
268            // closed by the caller.
269            usedPeer.setValue(true);
270          } finally {
271            lock.lock();
272            loading = false;
273            finishedLoading.signalAll();
274          }
275          if (shm.isDisconnected()) {
276            // If the peer closed immediately after the shared memory segment
277            // was created, the DomainSocketWatcher callback might already have
278            // fired and marked the shm as disconnected.  In this case, we
279            // obviously don't want to add the SharedMemorySegment to our list
280            // of valid not-full segments.
281            if (LOG.isDebugEnabled()) {
282              LOG.debug(this + ": the UNIX domain socket associated with " +
283                  "this short-circuit memory closed before we could make " +
284                  "use of the shm.");
285            }
286          } else {
287            notFull.put(shm.getShmId(), shm);
288          }
289        }
290      }
291    }
292    
293    /**
294     * Stop tracking a slot.
295     *
296     * Must be called with the EndpointShmManager lock held.
297     *
298     * @param slot          The slot to release.
299     */
300    void freeSlot(Slot slot) {
301      DfsClientShm shm = (DfsClientShm)slot.getShm();
302      shm.unregisterSlot(slot.getSlotIdx());
303      if (shm.isDisconnected()) {
304        // Stale shared memory segments should not be tracked here.
305        Preconditions.checkState(!full.containsKey(shm.getShmId()));
306        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
307        if (shm.isEmpty()) {
308          if (LOG.isTraceEnabled()) {
309            LOG.trace(this + ": freeing empty stale " + shm);
310          }
311          shm.free();
312        }
313      } else {
314        ShmId shmId = shm.getShmId();
315        full.remove(shmId); // The shm can't be full if we just freed a slot.
316        if (shm.isEmpty()) {
317          notFull.remove(shmId);
318  
319          // If the shared memory segment is now empty, we call shutdown(2) on
320          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
321          // which is watching this socket, will call DfsClientShm#handle,
322          // cleaning up this shared memory segment.
323          //
324          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
325          // we don't want to call DomainSocketWatcher#remove directly here.
326          //
327          // Note that we could experience 'fragmentation' here, where the
328          // DFSClient allocates a bunch of slots in different shared memory
329          // segments, and then frees most of them, but never fully empties out
330          // any segment.  We make some attempt to avoid this fragmentation by
331          // always allocating new slots out of the shared memory segment with the
332          // lowest ID, but it could still occur.  In most workloads,
333          // fragmentation should not be a major concern, since it doesn't impact
334          // peak file descriptor usage or the speed of allocation.
335          if (LOG.isTraceEnabled()) {
336            LOG.trace(this + ": shutting down UNIX domain socket for " +
337                "empty " + shm);
338          }
339          shutdown(shm);
340        } else {
341          notFull.put(shmId, shm);
342        }
343      }
344    }
345    
346    /**
347     * Unregister a shared memory segment.
348     *
349     * Once a segment is unregistered, we will not allocate any more slots
350     * inside that segment.
351     *
352     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
353     * lock.
354     *
355     * @param shmId         The ID of the shared memory segment to unregister.
356     */
357    void unregisterShm(ShmId shmId) {
358      lock.lock();
359      try {
360        full.remove(shmId);
361        notFull.remove(shmId);
362      } finally {
363        lock.unlock();
364      }
365    }
366
    @Override
    public String toString() {
      // Include the parent manager so log messages can be correlated with
      // the owning DfsClientShmManager instance.
      return String.format("EndpointShmManager(%s, parent=%s)",
          datanode, DfsClientShmManager.this);
    }
372
    /**
     * Expose this endpoint's state for {@link DfsClientShmManager#visit}.
     *
     * Note: the maps are passed by reference, not copied, so the caller must
     * hold the manager lock while examining them.
     */
    PerDatanodeVisitorInfo getVisitorInfo() {
      return new PerDatanodeVisitorInfo(full, notFull, disabled);
    }
376
377    final void shutdown(DfsClientShm shm) {
378      try {
379        shm.getPeer().getDomainSocket().shutdown();
380      } catch (IOException e) {
381        LOG.warn(this + ": error shutting down shm: got IOException calling " +
382            "shutdown(SHUT_RDWR)", e);
383      }
384    }
385  }
386
  /**
   * True if this DfsClientShmManager has been closed.
   *
   * Protected by the manager lock.
   */
  private boolean closed = false;

  /**
   * The manager lock, protecting the state of this class and of the
   * per-datanode EndpointShmManager objects.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);
  
  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.   This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;
414  
  /**
   * Create a DfsClientShmManager.
   *
   * @param interruptCheckPeriodMs  Passed through to the DomainSocketWatcher;
   *                                  presumably how often it checks for
   *                                  interrupts, in milliseconds -- confirm
   *                                  against DomainSocketWatcher.
   * @throws IOException             If the DomainSocketWatcher could not be
   *                                  created.
   */
  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs,
        "client");
  }
419  
420  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
421      MutableBoolean usedPeer, ExtendedBlockId blockId,
422      String clientName) throws IOException {
423    lock.lock();
424    try {
425      if (closed) {
426        LOG.trace(this + ": the DfsClientShmManager isclosed.");
427        return null;
428      }
429      EndpointShmManager shmManager = datanodes.get(datanode);
430      if (shmManager == null) {
431        shmManager = new EndpointShmManager(datanode);
432        datanodes.put(datanode, shmManager);
433      }
434      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
435    } finally {
436      lock.unlock();
437    }
438  }
439  
440  public void freeSlot(Slot slot) {
441    lock.lock();
442    try {
443      DfsClientShm shm = (DfsClientShm)slot.getShm();
444      shm.getEndpointShmManager().freeSlot(slot);
445    } finally {
446      lock.unlock();
447    }
448  }
449
  /**
   * A view of the per-datanode state, exposed for testing via
   * {@link DfsClientShmManager#visit}.
   *
   * Note: the maps are the live internal TreeMaps, not copies.
   */
  @VisibleForTesting
  public static class PerDatanodeVisitorInfo {
    // Segments with no free slots.
    public final TreeMap<ShmId, DfsClientShm> full;
    // Segments with at least one free slot.
    public final TreeMap<ShmId, DfsClientShm> notFull;
    // True if the datanode does not support short-circuit shared memory.
    public final boolean disabled;

    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
      this.full = full;
      this.notFull = notFull;
      this.disabled = disabled;
    }
  }
463
  /**
   * A callback which can examine the manager's per-datanode state under the
   * manager lock.  Exposed for testing.
   */
  @VisibleForTesting
  public interface Visitor {
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }
469
470  @VisibleForTesting
471  public void visit(Visitor visitor) throws IOException {
472    lock.lock();
473    try {
474      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = 
475          new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
476      for (Entry<DatanodeInfo, EndpointShmManager> entry :
477            datanodes.entrySet()) {
478        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
479      }
480      visitor.visit(info);
481    } finally {
482      lock.unlock();
483    }
484  }
485
486  /**
487   * Close the DfsClientShmManager.
488   */
489  @Override
490  public void close() throws IOException {
491    lock.lock();
492    try {
493      if (closed) return;
494      closed = true;
495    } finally {
496      lock.unlock();
497    }
498    // When closed, the domainSocketWatcher will issue callbacks that mark
499    // all the outstanding DfsClientShm segments as stale.
500    IOUtils.cleanup(LOG, domainSocketWatcher);
501  }
502
503
504  @Override
505  public String toString() {
506    return String.format("ShortCircuitShmManager(%08x)",
507        System.identityHashCode(this));
508  }
509
  /**
   * @return The DomainSocketWatcher used by this manager.  Exposed for
   *           testing.
   */
  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
514}