/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.shortcircuit;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/**
 * Manages short-circuit memory segments for an HDFS client.
 *
 * Clients are responsible for requesting and releasing shared memory segments used
 * for communicating with the DataNode.  The client will try to allocate new slots
 * in the set of existing segments, falling back to getting a new segment from the
 * DataNode via {@link DataTransferProtocol#requestShortCircuitFds}.
 *
 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
 */
@InterfaceAudience.Private
public class DfsClientShmManager implements Closeable {
  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);

  /**
   * Manages short-circuit memory segments that pertain to a given DataNode.
   */
  class EndpointShmManager {
    /**
     * The datanode we're managing.
     */
    private final DatanodeInfo datanode;

    /**
     * Shared memory segments which have no empty slots.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> full =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * Shared memory segments which have at least one empty slot.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> notFull =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * True if this datanode doesn't support short-circuit shared memory
     * segments.
     *
     * Protected by the manager lock.
     */
    private boolean disabled = false;

    /**
     * True if we're in the process of loading a shared memory segment from
     * this DataNode.
     *
     * Protected by the manager lock.
     */
    private boolean loading = false;

    EndpointShmManager(DatanodeInfo datanode) {
      this.datanode = datanode;
    }

    /**
     * Pull a slot out of a preexisting shared memory segment.
     *
     * Must be called with the manager lock held.
     *
     * @param blockId     The blockId to put inside the Slot object.
     *
     * @return            null if none of our shared memory segments contain a
     *                      free slot; the slot object otherwise.
     */
    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
      if (notFull.isEmpty()) {
        return null;
      }
      // Always allocate from the segment with the lowest ID, to limit
      // fragmentation across segments (see the comment in freeSlot).
      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
      DfsClientShm shm = entry.getValue();
      ShmId shmId = shm.getShmId();
      Slot slot = shm.allocAndRegisterSlot(blockId);
      if (shm.isFull()) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
              " out of " + shm);
        }
        // The segment just became full; move it to the 'full' map.
        DfsClientShm removedShm = notFull.remove(shmId);
        Preconditions.checkState(removedShm == shm);
        full.put(shmId, shm);
      } else {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
              " out of " + shm);
        }
      }
      return slot;
    }

    /**
     * Ask the DataNode for a new shared memory segment.  This function must be
     * called with the manager lock held.  We will release the lock while
     * communicating with the DataNode.
     *
     * @param clientName    The current client name.
     * @param peer          The peer to use to talk to the DataNode.
     *
     * @return              Null if the DataNode does not support shared memory
     *                        segments, or experienced an error creating the
     *                        shm.  The shared memory segment itself on success.
     * @throws IOException  If there was an error communicating over the socket.
     *                        We will not throw an IOException unless the socket
     *                        itself (or the network) is the problem.
     */
    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
        throws IOException {
      final DataOutputStream out =
          new DataOutputStream(
              new BufferedOutputStream(peer.getOutputStream()));
      new Sender(out).requestShortCircuitShm(clientName);
      ShortCircuitShmResponseProto resp =
          ShortCircuitShmResponseProto.parseFrom(
              PBHelper.vintPrefixed(peer.getInputStream()));
      String error = resp.hasError() ? resp.getError() : "(unknown)";
      switch (resp.getStatus()) {
      case SUCCESS:
        // On success, the DataNode passes us the file descriptor of the
        // shared memory segment over the UNIX domain socket.
        DomainSocket sock = peer.getDomainSocket();
        byte[] buf = new byte[1];
        FileInputStream[] fis = new FileInputStream[1];
        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
          throw new EOFException("got EOF while trying to transfer the " +
              "file descriptor for the shared memory segment.");
        }
        if (fis[0] == null) {
          throw new IOException("the datanode " + datanode + " failed to " +
              "pass a file descriptor for the shared memory segment.");
        }
        try {
          DfsClientShm shm =
              new DfsClientShm(PBHelper.convert(resp.getId()),
                  fis[0], this, peer);
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": createNewShm: created " + shm);
          }
          return shm;
        } finally {
          // The DfsClientShm has mapped the segment; the stream used to
          // receive the descriptor is no longer needed.
          IOUtils.cleanup(LOG, fis[0]);
        }
      case ERROR_UNSUPPORTED:
        // The DataNode just does not support short-circuit shared memory
        // access, and we should stop asking.
        LOG.info(this + ": datanode does not support short-circuit " +
            "shared memory access: " + error);
        disabled = true;
        return null;
      default:
        // The datanode experienced some kind of unexpected error when trying to
        // create the short-circuit shared memory segment.
        LOG.warn(this + ": error requesting short-circuit shared memory " +
            "access: " + error);
        return null;
      }
    }

    /**
     * Allocate a new shared memory slot connected to this datanode.
     *
     * Must be called with the EndpointShmManager lock held.
     *
     * @param peer          The peer to use to talk to the DataNode.
     * @param usedPeer      (out param) Will be set to true if we used the peer.
     *                        When a peer is used, it becomes part of the shared
     *                        memory segment's lifecycle and must not be closed
     *                        by the caller.
     * @param clientName    The client name.
     * @param blockId       The block ID to use.
     * @return              null if the DataNode does not support shared memory
     *                        segments, or experienced an error creating the
     *                        shm.  The shared memory segment itself on success.
     * @throws IOException  If there was an error communicating over the socket.
     */
    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
        String clientName, ExtendedBlockId blockId) throws IOException {
      while (true) {
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": the DfsClientShmManager has been closed.");
          }
          return null;
        }
        if (disabled) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shared memory segment access is disabled.");
          }
          return null;
        }
        // Try to use an existing slot.
        Slot slot = allocSlotFromExistingShm(blockId);
        if (slot != null) {
          return slot;
        }
        // There are no free slots.  If someone is loading more slots, wait
        // for that to finish.
        if (loading) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": waiting for loading to finish...");
          }
          finishedLoading.awaitUninterruptibly();
        } else {
          // Otherwise, load the slot ourselves.
          loading = true;
          lock.unlock();
          DfsClientShm shm;
          try {
            shm = requestNewShm(clientName, peer);
            if (shm == null) continue;
            // See #{DfsClientShmManager#domainSocketWatcher} for details
            // about why we do this before retaking the manager lock.
            domainSocketWatcher.add(peer.getDomainSocket(), shm);
            // The DomainPeer is now our responsibility, and should not be
            // closed by the caller.
            usedPeer.setValue(true);
          } finally {
            lock.lock();
            loading = false;
            finishedLoading.signalAll();
          }
          if (shm.isDisconnected()) {
            // If the peer closed immediately after the shared memory segment
            // was created, the DomainSocketWatcher callback might already have
            // fired and marked the shm as disconnected.  In this case, we
            // obviously don't want to add the SharedMemorySegment to our list
            // of valid not-full segments.
            if (LOG.isDebugEnabled()) {
              LOG.debug(this + ": the UNIX domain socket associated with " +
                  "this short-circuit memory closed before we could make " +
                  "use of the shm.");
            }
          } else {
            notFull.put(shm.getShmId(), shm);
          }
        }
      }
    }

    /**
     * Stop tracking a slot.
     *
     * Must be called with the EndpointShmManager lock held.
     *
     * @param slot          The slot to release.
     */
    void freeSlot(Slot slot) {
      DfsClientShm shm = (DfsClientShm)slot.getShm();
      shm.unregisterSlot(slot.getSlotIdx());
      if (shm.isDisconnected()) {
        // Stale shared memory segments should not be tracked here.
        Preconditions.checkState(!full.containsKey(shm.getShmId()));
        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
        if (shm.isEmpty()) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": freeing empty stale " + shm);
          }
          shm.free();
        }
      } else {
        ShmId shmId = shm.getShmId();
        full.remove(shmId); // The shm can't be full if we just freed a slot.
        if (shm.isEmpty()) {
          notFull.remove(shmId);

          // If the shared memory segment is now empty, we call shutdown(2) on
          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
          // which is watching this socket, will call DfsClientShm#handle,
          // cleaning up this shared memory segment.
          //
          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
          // we don't want to call DomainSocketWatcher#remove directly here.
          //
          // Note that we could experience 'fragmentation' here, where the
          // DFSClient allocates a bunch of slots in different shared memory
          // segments, and then frees most of them, but never fully empties out
          // any segment.  We make some attempt to avoid this fragmentation by
          // always allocating new slots out of the shared memory segment with the
          // lowest ID, but it could still occur.  In most workloads,
          // fragmentation should not be a major concern, since it doesn't impact
          // peak file descriptor usage or the speed of allocation.
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shutting down UNIX domain socket for " +
                "empty " + shm);
          }
          shutdown(shm);
        } else {
          notFull.put(shmId, shm);
        }
      }
    }

    /**
     * Unregister a shared memory segment.
     *
     * Once a segment is unregistered, we will not allocate any more slots
     * inside that segment.
     *
     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
     * lock.
     *
     * @param shmId         The ID of the shared memory segment to unregister.
     */
    void unregisterShm(ShmId shmId) {
      lock.lock();
      try {
        full.remove(shmId);
        notFull.remove(shmId);
      } finally {
        lock.unlock();
      }
    }

    @Override
    public String toString() {
      return String.format("EndpointShmManager(%s, parent=%s)",
          datanode, DfsClientShmManager.this);
    }

    PerDatanodeVisitorInfo getVisitorInfo() {
      return new PerDatanodeVisitorInfo(full, notFull, disabled);
    }

    final void shutdown(DfsClientShm shm) {
      try {
        shm.getPeer().getDomainSocket().shutdown();
      } catch (IOException e) {
        LOG.warn(this + ": error shutting down shm: got IOException calling " +
            "shutdown(SHUT_RDWR)", e);
      }
    }
  }

  private boolean closed = false;

  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);

  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you must MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.   This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;

  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs,
        "client");
  }

  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
      MutableBoolean usedPeer, ExtendedBlockId blockId,
      String clientName) throws IOException {
    lock.lock();
    try {
      if (closed) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": the DfsClientShmManager is closed.");
        }
        return null;
      }
      EndpointShmManager shmManager = datanodes.get(datanode);
      if (shmManager == null) {
        shmManager = new EndpointShmManager(datanode);
        datanodes.put(datanode, shmManager);
      }
      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
    } finally {
      lock.unlock();
    }
  }

  public void freeSlot(Slot slot) {
    lock.lock();
    try {
      DfsClientShm shm = (DfsClientShm)slot.getShm();
      shm.getEndpointShmManager().freeSlot(slot);
    } finally {
      lock.unlock();
    }
  }

  @VisibleForTesting
  public static class PerDatanodeVisitorInfo {
    public final TreeMap<ShmId, DfsClientShm> full;
    public final TreeMap<ShmId, DfsClientShm> notFull;
    public final boolean disabled;

    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
      this.full = full;
      this.notFull = notFull;
      this.disabled = disabled;
    }
  }

  @VisibleForTesting
  public interface Visitor {
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }

  @VisibleForTesting
  public void visit(Visitor visitor) throws IOException {
    lock.lock();
    try {
      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info =
          new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
      for (Entry<DatanodeInfo, EndpointShmManager> entry :
            datanodes.entrySet()) {
        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
      }
      visitor.visit(info);
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the DfsClientShmManager.
   */
  @Override
  public void close() throws IOException {
    lock.lock();
    try {
      if (closed) return;
      closed = true;
    } finally {
      lock.unlock();
    }
    // When closed, the domainSocketWatcher will issue callbacks that mark
    // all the outstanding DfsClientShm segments as stale.
    IOUtils.cleanup(LOG, domainSocketWatcher);
  }

  @Override
  public String toString() {
    return String.format("ShortCircuitShmManager(%08x)",
        System.identityHashCode(this));
  }

  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
}