001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.datanode; 019 020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS; 021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT; 022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS; 023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT; 024 025import java.io.Closeable; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.util.HashMap; 029import java.util.HashSet; 030import java.util.Iterator; 031import java.util.Set; 032 033import com.google.common.annotations.VisibleForTesting; 034import org.apache.commons.io.IOUtils; 035import org.apache.commons.logging.Log; 036import org.apache.commons.logging.LogFactory; 037import org.apache.hadoop.conf.Configuration; 038import org.apache.hadoop.fs.InvalidRequestException; 039import org.apache.hadoop.hdfs.ExtendedBlockId; 040import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm; 041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId; 042import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot; 043import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId; 044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory; 045import org.apache.hadoop.net.unix.DomainSocket; 046import org.apache.hadoop.net.unix.DomainSocketWatcher; 047 048import com.google.common.base.Joiner; 049import com.google.common.base.Preconditions; 050import com.google.common.collect.HashMultimap; 051 052/** 053 * Manages client short-circuit memory segments on the DataNode. 054 * 055 * DFSClients request shared memory segments from the DataNode. The 056 * ShortCircuitRegistry generates and manages these segments. Each segment 057 * has a randomly generated 128-bit ID which uniquely identifies it. The 058 * segments each contain several "slots." 059 * 060 * Before performing a short-circuit read, DFSClients must request a pair of 061 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS 062 * operation. As part of this operation, DFSClients pass the ID of the shared 063 * memory segment they would like to use to communicate information about this 064 * replica, as well as the slot number within that segment they would like to 065 * use. Slot allocation is always done by the client. 066 * 067 * Slots are used to track the state of the block on the both the client and 068 * datanode. When this DataNode mlocks a block, the corresponding slots for the 069 * replicas are marked as "anchorable". Anchorable blocks can be safely read 070 * without verifying the checksum. This means that BlockReaderLocal objects 071 * using these replicas can skip checksumming. It also means that we can do 072 * zero-copy reads on these replicas (the ZCR interface has no way of 073 * verifying checksums.) 074 * 075 * When a DN needs to munlock a block, it needs to first wait for the block to 076 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 077 * DN also marks the block's slots as "unanchorable" to prevent additional 078 * clients from initiating these operations in the future. 079 * 080 * The counterpart of this class on the client is {@link DfsClientShmManager}. 081 */ 082public class ShortCircuitRegistry { 083 public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class); 084 085 private static final int SHM_LENGTH = 8192; 086 087 public static class RegisteredShm extends ShortCircuitShm 088 implements DomainSocketWatcher.Handler { 089 private final String clientName; 090 private final ShortCircuitRegistry registry; 091 092 RegisteredShm(String clientName, ShmId shmId, FileInputStream stream, 093 ShortCircuitRegistry registry) throws IOException { 094 super(shmId, stream); 095 this.clientName = clientName; 096 this.registry = registry; 097 } 098 099 @Override 100 public boolean handle(DomainSocket sock) { 101 synchronized (registry) { 102 synchronized (this) { 103 registry.removeShm(this); 104 } 105 } 106 return true; 107 } 108 109 String getClientName() { 110 return clientName; 111 } 112 } 113 114 public synchronized void removeShm(ShortCircuitShm shm) { 115 if (LOG.isTraceEnabled()) { 116 LOG.debug("removing shm " + shm); 117 } 118 // Stop tracking the shmId. 119 RegisteredShm removedShm = segments.remove(shm.getShmId()); 120 Preconditions.checkState(removedShm == shm, 121 "failed to remove " + shm.getShmId()); 122 // Stop tracking the slots. 123 for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) { 124 Slot slot = iter.next(); 125 boolean removed = slots.remove(slot.getBlockId(), slot); 126 Preconditions.checkState(removed); 127 slot.makeInvalid(); 128 } 129 // De-allocate the memory map and close the shared file. 130 shm.free(); 131 } 132 133 /** 134 * Whether or not the registry is enabled. 135 */ 136 private boolean enabled; 137 138 /** 139 * The factory which creates shared file descriptors. 140 */ 141 private final SharedFileDescriptorFactory shmFactory; 142 143 /** 144 * A watcher which sends out callbacks when the UNIX domain socket 145 * associated with a shared memory segment closes. 146 */ 147 private final DomainSocketWatcher watcher; 148 149 private final HashMap<ShmId, RegisteredShm> segments = 150 new HashMap<ShmId, RegisteredShm>(0); 151 152 private final HashMultimap<ExtendedBlockId, Slot> slots = 153 HashMultimap.create(0, 1); 154 155 public ShortCircuitRegistry(Configuration conf) throws IOException { 156 boolean enabled = false; 157 SharedFileDescriptorFactory shmFactory = null; 158 DomainSocketWatcher watcher = null; 159 try { 160 int interruptCheck = conf.getInt( 161 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS, 162 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT); 163 if (interruptCheck <= 0) { 164 throw new IOException( 165 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS + 166 " was set to " + interruptCheck); 167 } 168 String shmPaths[] = 169 conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS); 170 if (shmPaths.length == 0) { 171 shmPaths = 172 DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(","); 173 } 174 shmFactory = SharedFileDescriptorFactory. 175 create("HadoopShortCircuitShm_", shmPaths); 176 String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason(); 177 if (dswLoadingFailure != null) { 178 throw new IOException(dswLoadingFailure); 179 } 180 watcher = new DomainSocketWatcher(interruptCheck, "datanode"); 181 enabled = true; 182 if (LOG.isDebugEnabled()) { 183 LOG.debug("created new ShortCircuitRegistry with interruptCheck=" + 184 interruptCheck + ", shmPath=" + shmFactory.getPath()); 185 } 186 } catch (IOException e) { 187 if (LOG.isDebugEnabled()) { 188 LOG.debug("Disabling ShortCircuitRegistry", e); 189 } 190 } finally { 191 this.enabled = enabled; 192 this.shmFactory = shmFactory; 193 this.watcher = watcher; 194 } 195 } 196 197 /** 198 * Process a block mlock event from the FsDatasetCache. 199 * 200 * @param blockId The block that was mlocked. 201 */ 202 public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) { 203 if (!enabled) return; 204 Set<Slot> affectedSlots = slots.get(blockId); 205 for (Slot slot : affectedSlots) { 206 slot.makeAnchorable(); 207 } 208 } 209 210 /** 211 * Mark any slots associated with this blockId as unanchorable. 212 * 213 * @param blockId The block ID. 214 * @return True if we should allow the munlock request. 215 */ 216 public synchronized boolean processBlockMunlockRequest( 217 ExtendedBlockId blockId) { 218 if (!enabled) return true; 219 boolean allowMunlock = true; 220 Set<Slot> affectedSlots = slots.get(blockId); 221 for (Slot slot : affectedSlots) { 222 slot.makeUnanchorable(); 223 if (slot.isAnchored()) { 224 allowMunlock = false; 225 } 226 } 227 return allowMunlock; 228 } 229 230 /** 231 * Invalidate any slot associated with a blockId that we are invalidating 232 * (deleting) from this DataNode. When a slot is invalid, the DFSClient will 233 * not use the corresponding replica for new read or mmap operations (although 234 * existing, ongoing read or mmap operations will complete.) 235 * 236 * @param blockId The block ID. 237 */ 238 public synchronized void processBlockInvalidation(ExtendedBlockId blockId) { 239 if (!enabled) return; 240 final Set<Slot> affectedSlots = slots.get(blockId); 241 if (!affectedSlots.isEmpty()) { 242 final StringBuilder bld = new StringBuilder(); 243 String prefix = ""; 244 bld.append("Block ").append(blockId).append(" has been invalidated. "). 245 append("Marking short-circuit slots as invalid: "); 246 for (Slot slot : affectedSlots) { 247 slot.makeInvalid(); 248 bld.append(prefix).append(slot.toString()); 249 prefix = ", "; 250 } 251 LOG.info(bld.toString()); 252 } 253 } 254 255 public synchronized String getClientNames(ExtendedBlockId blockId) { 256 if (!enabled) return ""; 257 final HashSet<String> clientNames = new HashSet<String>(); 258 final Set<Slot> affectedSlots = slots.get(blockId); 259 for (Slot slot : affectedSlots) { 260 clientNames.add(((RegisteredShm)slot.getShm()).getClientName()); 261 } 262 return Joiner.on(",").join(clientNames); 263 } 264 265 public static class NewShmInfo implements Closeable { 266 public final ShmId shmId; 267 public final FileInputStream stream; 268 269 NewShmInfo(ShmId shmId, FileInputStream stream) { 270 this.shmId = shmId; 271 this.stream = stream; 272 } 273 274 @Override 275 public void close() throws IOException { 276 stream.close(); 277 } 278 } 279 280 /** 281 * Handle a DFSClient request to create a new memory segment. 282 * 283 * @param clientName Client name as reported by the client. 284 * @param sock The DomainSocket to associate with this memory 285 * segment. When this socket is closed, or the 286 * other side writes anything to the socket, the 287 * segment will be closed. This can happen at any 288 * time, including right after this function returns. 289 * @return A NewShmInfo object. The caller must close the 290 * NewShmInfo object once they are done with it. 291 * @throws IOException If the new memory segment could not be created. 292 */ 293 public NewShmInfo createNewMemorySegment(String clientName, 294 DomainSocket sock) throws IOException { 295 NewShmInfo info = null; 296 RegisteredShm shm = null; 297 ShmId shmId = null; 298 synchronized (this) { 299 if (!enabled) { 300 if (LOG.isTraceEnabled()) { 301 LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " + 302 "not enabled."); 303 } 304 throw new UnsupportedOperationException(); 305 } 306 FileInputStream fis = null; 307 try { 308 do { 309 shmId = ShmId.createRandom(); 310 } while (segments.containsKey(shmId)); 311 fis = shmFactory.createDescriptor(clientName, SHM_LENGTH); 312 shm = new RegisteredShm(clientName, shmId, fis, this); 313 } finally { 314 if (shm == null) { 315 IOUtils.closeQuietly(fis); 316 } 317 } 318 info = new NewShmInfo(shmId, fis); 319 segments.put(shmId, shm); 320 } 321 // Drop the registry lock to prevent deadlock. 322 // After this point, RegisteredShm#handle may be called at any time. 323 watcher.add(sock, shm); 324 if (LOG.isTraceEnabled()) { 325 LOG.trace("createNewMemorySegment: created " + info.shmId); 326 } 327 return info; 328 } 329 330 public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId, 331 boolean isCached) throws InvalidRequestException { 332 if (!enabled) { 333 if (LOG.isTraceEnabled()) { 334 LOG.trace(this + " can't register a slot because the " + 335 "ShortCircuitRegistry is not enabled."); 336 } 337 throw new UnsupportedOperationException(); 338 } 339 ShmId shmId = slotId.getShmId(); 340 RegisteredShm shm = segments.get(shmId); 341 if (shm == null) { 342 throw new InvalidRequestException("there is no shared memory segment " + 343 "registered with shmId " + shmId); 344 } 345 Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId); 346 if (isCached) { 347 slot.makeAnchorable(); 348 } else { 349 slot.makeUnanchorable(); 350 } 351 boolean added = slots.put(blockId, slot); 352 Preconditions.checkState(added); 353 if (LOG.isTraceEnabled()) { 354 LOG.trace(this + ": registered " + blockId + " with slot " + 355 slotId + " (isCached=" + isCached + ")"); 356 } 357 } 358 359 public synchronized void unregisterSlot(SlotId slotId) 360 throws InvalidRequestException { 361 if (!enabled) { 362 if (LOG.isTraceEnabled()) { 363 LOG.trace("unregisterSlot: ShortCircuitRegistry is " + 364 "not enabled."); 365 } 366 throw new UnsupportedOperationException(); 367 } 368 ShmId shmId = slotId.getShmId(); 369 RegisteredShm shm = segments.get(shmId); 370 if (shm == null) { 371 throw new InvalidRequestException("there is no shared memory segment " + 372 "registered with shmId " + shmId); 373 } 374 Slot slot = shm.getSlot(slotId.getSlotIdx()); 375 slot.makeInvalid(); 376 shm.unregisterSlot(slotId.getSlotIdx()); 377 slots.remove(slot.getBlockId(), slot); 378 } 379 380 public void shutdown() { 381 synchronized (this) { 382 if (!enabled) return; 383 enabled = false; 384 } 385 IOUtils.closeQuietly(watcher); 386 } 387 388 public static interface Visitor { 389 void accept(HashMap<ShmId, RegisteredShm> segments, 390 HashMultimap<ExtendedBlockId, Slot> slots); 391 } 392 393 @VisibleForTesting 394 public synchronized void visit(Visitor visitor) { 395 visitor.accept(segments, slots); 396 } 397}