001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.shortcircuit; 019 020import java.io.FileInputStream; 021import java.io.IOException; 022import java.lang.reflect.Field; 023import java.util.BitSet; 024import java.util.Iterator; 025import java.util.NoSuchElementException; 026import java.util.Random; 027 028import org.apache.commons.lang.builder.EqualsBuilder; 029import org.apache.commons.lang.builder.HashCodeBuilder; 030import org.apache.commons.logging.Log; 031import org.apache.commons.logging.LogFactory; 032import org.apache.hadoop.fs.InvalidRequestException; 033import org.apache.hadoop.hdfs.ExtendedBlockId; 034import org.apache.hadoop.io.nativeio.NativeIO; 035import org.apache.hadoop.io.nativeio.NativeIO.POSIX; 036import org.apache.hadoop.util.Shell; 037import org.apache.hadoop.util.StringUtils; 038 039import sun.misc.Unsafe; 040 041import com.google.common.base.Preconditions; 042import com.google.common.collect.ComparisonChain; 043import com.google.common.primitives.Ints; 044 045/** 046 * A shared memory segment used to implement short-circuit reads. 047 */ 048public class ShortCircuitShm { 049 private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class); 050 051 protected static final int BYTES_PER_SLOT = 64; 052 053 private static final Unsafe unsafe = safetyDance(); 054 055 private static Unsafe safetyDance() { 056 try { 057 Field f = Unsafe.class.getDeclaredField("theUnsafe"); 058 f.setAccessible(true); 059 return (Unsafe)f.get(null); 060 } catch (Throwable e) { 061 LOG.error("failed to load misc.Unsafe", e); 062 } 063 return null; 064 } 065 066 /** 067 * Calculate the usable size of a shared memory segment. 068 * We round down to a multiple of the slot size and do some validation. 069 * 070 * @param stream The stream we're using. 071 * @return The usable size of the shared memory segment. 072 */ 073 private static int getUsableLength(FileInputStream stream) 074 throws IOException { 075 int intSize = Ints.checkedCast(stream.getChannel().size()); 076 int slots = intSize / BYTES_PER_SLOT; 077 if (slots == 0) { 078 throw new IOException("size of shared memory segment was " + 079 intSize + ", but that is not enough to hold even one slot."); 080 } 081 return slots * BYTES_PER_SLOT; 082 } 083 084 /** 085 * Identifies a DfsClientShm. 086 */ 087 public static class ShmId implements Comparable<ShmId> { 088 private static final Random random = new Random(); 089 private final long hi; 090 private final long lo; 091 092 /** 093 * Generate a random ShmId. 094 * 095 * We generate ShmIds randomly to prevent a malicious client from 096 * successfully guessing one and using that to interfere with another 097 * client. 098 */ 099 public static ShmId createRandom() { 100 return new ShmId(random.nextLong(), random.nextLong()); 101 } 102 103 public ShmId(long hi, long lo) { 104 this.hi = hi; 105 this.lo = lo; 106 } 107 108 public long getHi() { 109 return hi; 110 } 111 112 public long getLo() { 113 return lo; 114 } 115 116 @Override 117 public boolean equals(Object o) { 118 if ((o == null) || (o.getClass() != this.getClass())) { 119 return false; 120 } 121 ShmId other = (ShmId)o; 122 return new EqualsBuilder(). 123 append(hi, other.hi). 124 append(lo, other.lo). 125 isEquals(); 126 } 127 128 @Override 129 public int hashCode() { 130 return new HashCodeBuilder(). 131 append(this.hi). 132 append(this.lo). 133 toHashCode(); 134 } 135 136 @Override 137 public String toString() { 138 return String.format("%016x%016x", hi, lo); 139 } 140 141 @Override 142 public int compareTo(ShmId other) { 143 return ComparisonChain.start(). 144 compare(hi, other.hi). 145 compare(lo, other.lo). 146 result(); 147 } 148 }; 149 150 /** 151 * Uniquely identifies a slot. 152 */ 153 public static class SlotId { 154 private final ShmId shmId; 155 private final int slotIdx; 156 157 public SlotId(ShmId shmId, int slotIdx) { 158 this.shmId = shmId; 159 this.slotIdx = slotIdx; 160 } 161 162 public ShmId getShmId() { 163 return shmId; 164 } 165 166 public int getSlotIdx() { 167 return slotIdx; 168 } 169 170 @Override 171 public boolean equals(Object o) { 172 if ((o == null) || (o.getClass() != this.getClass())) { 173 return false; 174 } 175 SlotId other = (SlotId)o; 176 return new EqualsBuilder(). 177 append(shmId, other.shmId). 178 append(slotIdx, other.slotIdx). 179 isEquals(); 180 } 181 182 @Override 183 public int hashCode() { 184 return new HashCodeBuilder(). 185 append(this.shmId). 186 append(this.slotIdx). 187 toHashCode(); 188 } 189 190 @Override 191 public String toString() { 192 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx); 193 } 194 } 195 196 public class SlotIterator implements Iterator<Slot> { 197 int slotIdx = -1; 198 199 @Override 200 public boolean hasNext() { 201 synchronized (ShortCircuitShm.this) { 202 return allocatedSlots.nextSetBit(slotIdx + 1) != -1; 203 } 204 } 205 206 @Override 207 public Slot next() { 208 synchronized (ShortCircuitShm.this) { 209 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1); 210 if (nextSlotIdx == -1) { 211 throw new NoSuchElementException(); 212 } 213 slotIdx = nextSlotIdx; 214 return slots[nextSlotIdx]; 215 } 216 } 217 218 @Override 219 public void remove() { 220 throw new UnsupportedOperationException("SlotIterator " + 221 "doesn't support removal"); 222 } 223 } 224 225 /** 226 * A slot containing information about a replica. 227 * 228 * The format is: 229 * word 0 230 * bit 0:32 Slot flags (see below). 231 * bit 33:63 Anchor count. 232 * word 1:7 233 * Reserved for future use, such as statistics. 234 * Padding is also useful for avoiding false sharing. 235 * 236 * Little-endian versus big-endian is not relevant here since both the client 237 * and the server reside on the same computer and use the same orientation. 238 */ 239 public class Slot { 240 /** 241 * Flag indicating that the slot is valid. 242 * 243 * The DFSClient sets this flag when it allocates a new slot within one of 244 * its shared memory regions. 245 * 246 * The DataNode clears this flag when the replica associated with this slot 247 * is no longer valid. The client itself also clears this flag when it 248 * believes that the DataNode is no longer using this slot to communicate. 249 */ 250 private static final long VALID_FLAG = 1L<<63; 251 252 /** 253 * Flag indicating that the slot can be anchored. 254 */ 255 private static final long ANCHORABLE_FLAG = 1L<<62; 256 257 /** 258 * The slot address in memory. 259 */ 260 private final long slotAddress; 261 262 /** 263 * BlockId of the block this slot is used for. 264 */ 265 private final ExtendedBlockId blockId; 266 267 Slot(long slotAddress, ExtendedBlockId blockId) { 268 this.slotAddress = slotAddress; 269 this.blockId = blockId; 270 } 271 272 /** 273 * Get the short-circuit memory segment associated with this Slot. 274 * 275 * @return The enclosing short-circuit memory segment. 276 */ 277 public ShortCircuitShm getShm() { 278 return ShortCircuitShm.this; 279 } 280 281 /** 282 * Get the ExtendedBlockId associated with this slot. 283 * 284 * @return The ExtendedBlockId of this slot. 285 */ 286 public ExtendedBlockId getBlockId() { 287 return blockId; 288 } 289 290 /** 291 * Get the SlotId of this slot, containing both shmId and slotIdx. 292 * 293 * @return The SlotId of this slot. 294 */ 295 public SlotId getSlotId() { 296 return new SlotId(getShmId(), getSlotIdx()); 297 } 298 299 /** 300 * Get the Slot index. 301 * 302 * @return The index of this slot. 303 */ 304 public int getSlotIdx() { 305 return Ints.checkedCast( 306 (slotAddress - baseAddress) / BYTES_PER_SLOT); 307 } 308 309 /** 310 * Clear the slot. 311 */ 312 void clear() { 313 unsafe.putLongVolatile(null, this.slotAddress, 0); 314 } 315 316 private boolean isSet(long flag) { 317 long prev = unsafe.getLongVolatile(null, this.slotAddress); 318 return (prev & flag) != 0; 319 } 320 321 private void setFlag(long flag) { 322 long prev; 323 do { 324 prev = unsafe.getLongVolatile(null, this.slotAddress); 325 if ((prev & flag) != 0) { 326 return; 327 } 328 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 329 prev, prev | flag)); 330 } 331 332 private void clearFlag(long flag) { 333 long prev; 334 do { 335 prev = unsafe.getLongVolatile(null, this.slotAddress); 336 if ((prev & flag) == 0) { 337 return; 338 } 339 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 340 prev, prev & (~flag))); 341 } 342 343 public boolean isValid() { 344 return isSet(VALID_FLAG); 345 } 346 347 public void makeValid() { 348 setFlag(VALID_FLAG); 349 } 350 351 public void makeInvalid() { 352 clearFlag(VALID_FLAG); 353 } 354 355 public boolean isAnchorable() { 356 return isSet(ANCHORABLE_FLAG); 357 } 358 359 public void makeAnchorable() { 360 setFlag(ANCHORABLE_FLAG); 361 } 362 363 public void makeUnanchorable() { 364 clearFlag(ANCHORABLE_FLAG); 365 } 366 367 public boolean isAnchored() { 368 long prev = unsafe.getLongVolatile(null, this.slotAddress); 369 if ((prev & VALID_FLAG) == 0) { 370 // Slot is no longer valid. 371 return false; 372 } 373 return ((prev & 0x7fffffff) != 0); 374 } 375 376 /** 377 * Try to add an anchor for a given slot. 378 * 379 * When a slot is anchored, we know that the block it refers to is resident 380 * in memory. 381 * 382 * @return True if the slot is anchored. 383 */ 384 public boolean addAnchor() { 385 long prev; 386 do { 387 prev = unsafe.getLongVolatile(null, this.slotAddress); 388 if ((prev & VALID_FLAG) == 0) { 389 // Slot is no longer valid. 390 return false; 391 } 392 if ((prev & ANCHORABLE_FLAG) == 0) { 393 // Slot can't be anchored right now. 394 return false; 395 } 396 if ((prev & 0x7fffffff) == 0x7fffffff) { 397 // Too many other threads have anchored the slot (2 billion?) 398 return false; 399 } 400 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 401 prev, prev + 1)); 402 return true; 403 } 404 405 /** 406 * Remove an anchor for a given slot. 407 */ 408 public void removeAnchor() { 409 long prev; 410 do { 411 prev = unsafe.getLongVolatile(null, this.slotAddress); 412 Preconditions.checkState((prev & 0x7fffffff) != 0, 413 "Tried to remove anchor for slot " + slotAddress +", which was " + 414 "not anchored."); 415 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 416 prev, prev - 1)); 417 } 418 419 @Override 420 public String toString() { 421 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")"; 422 } 423 } 424 425 /** 426 * ID for this SharedMemorySegment. 427 */ 428 private final ShmId shmId; 429 430 /** 431 * The base address of the memory-mapped file. 432 */ 433 private final long baseAddress; 434 435 /** 436 * The mmapped length of the shared memory segment 437 */ 438 private final int mmappedLength; 439 440 /** 441 * The slots associated with this shared memory segment. 442 * slot[i] contains the slot at offset i * BYTES_PER_SLOT, 443 * or null if that slot is not allocated. 444 */ 445 private final Slot slots[]; 446 447 /** 448 * A bitset where each bit represents a slot which is in use. 449 */ 450 private final BitSet allocatedSlots; 451 452 /** 453 * Create the ShortCircuitShm. 454 * 455 * @param shmId The ID to use. 456 * @param stream The stream that we're going to use to create this 457 * shared memory segment. 458 * 459 * Although this is a FileInputStream, we are going to 460 * assume that the underlying file descriptor is writable 461 * as well as readable. It would be more appropriate to use 462 * a RandomAccessFile here, but that class does not have 463 * any public accessor which returns a FileDescriptor, 464 * unlike FileInputStream. 465 */ 466 public ShortCircuitShm(ShmId shmId, FileInputStream stream) 467 throws IOException { 468 if (!NativeIO.isAvailable()) { 469 throw new UnsupportedOperationException("NativeIO is not available."); 470 } 471 if (Shell.WINDOWS) { 472 throw new UnsupportedOperationException( 473 "DfsClientShm is not yet implemented for Windows."); 474 } 475 if (unsafe == null) { 476 throw new UnsupportedOperationException( 477 "can't use DfsClientShm because we failed to " + 478 "load misc.Unsafe."); 479 } 480 this.shmId = shmId; 481 this.mmappedLength = getUsableLength(stream); 482 this.baseAddress = POSIX.mmap(stream.getFD(), 483 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength); 484 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT]; 485 this.allocatedSlots = new BitSet(slots.length); 486 if (LOG.isTraceEnabled()) { 487 LOG.trace("creating " + this.getClass().getSimpleName() + 488 "(shmId=" + shmId + 489 ", mmappedLength=" + mmappedLength + 490 ", baseAddress=" + String.format("%x", baseAddress) + 491 ", slots.length=" + slots.length + ")"); 492 } 493 } 494 495 public final ShmId getShmId() { 496 return shmId; 497 } 498 499 /** 500 * Determine if this shared memory object is empty. 501 * 502 * @return True if the shared memory object is empty. 503 */ 504 synchronized final public boolean isEmpty() { 505 return allocatedSlots.nextSetBit(0) == -1; 506 } 507 508 /** 509 * Determine if this shared memory object is full. 510 * 511 * @return True if the shared memory object is full. 512 */ 513 synchronized final public boolean isFull() { 514 return allocatedSlots.nextClearBit(0) >= slots.length; 515 } 516 517 /** 518 * Calculate the base address of a slot. 519 * 520 * @param slotIdx Index of the slot. 521 * @return The base address of the slot. 522 */ 523 private final long calculateSlotAddress(int slotIdx) { 524 long offset = slotIdx; 525 offset *= BYTES_PER_SLOT; 526 return this.baseAddress + offset; 527 } 528 529 /** 530 * Allocate a new slot and register it. 531 * 532 * This function chooses an empty slot, initializes it, and then returns 533 * the relevant Slot object. 534 * 535 * @return The new slot. 536 */ 537 synchronized public final Slot allocAndRegisterSlot( 538 ExtendedBlockId blockId) { 539 int idx = allocatedSlots.nextClearBit(0); 540 if (idx >= slots.length) { 541 throw new RuntimeException(this + ": no more slots are available."); 542 } 543 allocatedSlots.set(idx, true); 544 Slot slot = new Slot(calculateSlotAddress(idx), blockId); 545 slot.clear(); 546 slot.makeValid(); 547 slots[idx] = slot; 548 if (LOG.isTraceEnabled()) { 549 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots + 550 StringUtils.getStackTrace(Thread.currentThread())); 551 } 552 return slot; 553 } 554 555 synchronized public final Slot getSlot(int slotIdx) 556 throws InvalidRequestException { 557 if (!allocatedSlots.get(slotIdx)) { 558 throw new InvalidRequestException(this + ": slot " + slotIdx + 559 " does not exist."); 560 } 561 return slots[slotIdx]; 562 } 563 564 /** 565 * Register a slot. 566 * 567 * This function looks at a slot which has already been initialized (by 568 * another process), and registers it with us. Then, it returns the 569 * relevant Slot object. 570 * 571 * @return The slot. 572 * 573 * @throws InvalidRequestException 574 * If the slot index we're trying to allocate has not been 575 * initialized, or is already in use. 576 */ 577 synchronized public final Slot registerSlot(int slotIdx, 578 ExtendedBlockId blockId) throws InvalidRequestException { 579 if (slotIdx < 0) { 580 throw new InvalidRequestException(this + ": invalid negative slot " + 581 "index " + slotIdx); 582 } 583 if (slotIdx >= slots.length) { 584 throw new InvalidRequestException(this + ": invalid slot " + 585 "index " + slotIdx); 586 } 587 if (allocatedSlots.get(slotIdx)) { 588 throw new InvalidRequestException(this + ": slot " + slotIdx + 589 " is already in use."); 590 } 591 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId); 592 if (!slot.isValid()) { 593 throw new InvalidRequestException(this + ": slot " + slotIdx + 594 " is not marked as valid."); 595 } 596 slots[slotIdx] = slot; 597 allocatedSlots.set(slotIdx, true); 598 if (LOG.isTraceEnabled()) { 599 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots + 600 StringUtils.getStackTrace(Thread.currentThread())); 601 } 602 return slot; 603 } 604 605 /** 606 * Unregisters a slot. 607 * 608 * This doesn't alter the contents of the slot. It just means 609 * 610 * @param slotIdx Index of the slot to unregister. 611 */ 612 synchronized public final void unregisterSlot(int slotIdx) { 613 Preconditions.checkState(allocatedSlots.get(slotIdx), 614 "tried to unregister slot " + slotIdx + ", which was not registered."); 615 allocatedSlots.set(slotIdx, false); 616 slots[slotIdx] = null; 617 if (LOG.isTraceEnabled()) { 618 LOG.trace(this + ": unregisterSlot " + slotIdx); 619 } 620 } 621 622 /** 623 * Iterate over all allocated slots. 624 * 625 * Note that this method isn't safe if 626 * 627 * @return The slot iterator. 628 */ 629 public SlotIterator slotIterator() { 630 return new SlotIterator(); 631 } 632 633 public void free() { 634 try { 635 POSIX.munmap(baseAddress, mmappedLength); 636 } catch (IOException e) { 637 LOG.warn(this + ": failed to munmap", e); 638 } 639 LOG.trace(this + ": freed"); 640 } 641 642 @Override 643 public String toString() { 644 return this.getClass().getSimpleName() + "(" + shmId + ")"; 645 } 646}