/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.protocol;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.Replica;
import com.google.common.base.Preconditions;
import com.google.protobuf.ByteString;
import com.google.protobuf.CodedInputStream;
import com.google.protobuf.CodedOutputStream;

/**
 * A block report that can be viewed in three ways: as a varint-encoded
 * {@link ByteString} buffer, as the old-style array of longs, or as an
 * iteration over {@link BlockReportReplica}s.
 *
 * Instances are obtained through the static factories
 * ({@link #decodeBuffer}, {@link #decodeBuffers}, {@link #decodeLongs},
 * {@link #encode}) or through a {@link Builder}.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public abstract class BlockListAsLongs implements Iterable<BlockReportReplica> {
  private final static int CHUNK_SIZE = 64*1024; // 64K
  // old-style header only: zero finalized and zero under-construction replicas
  private static final long[] EMPTY_LONGS = new long[]{0, 0};

  /** A block report that contains no blocks. */
  public static BlockListAsLongs EMPTY = new BlockListAsLongs() {
    @Override
    public int getNumberOfBlocks() {
      return 0;
    }
    @Override
    public ByteString getBlocksBuffer() {
      return ByteString.EMPTY;
    }
    @Override
    public long[] getBlockListAsLongs() {
      return EMPTY_LONGS;
    }
    @Override
    public Iterator<BlockReportReplica> iterator() {
      return Collections.emptyIterator();
    }
  };

  /**
   * Prepare an instance to in-place decode the given ByteString buffer
   * @param numBlocks - blocks in the buffer
   * @param blocksBuf - ByteString encoded varints
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeBuffer(final int numBlocks,
      final ByteString blocksBuf) {
    return new BufferDecoder(numBlocks, blocksBuf);
  }

  /**
   * Prepare an instance to in-place decode the given ByteString buffers
   * @param numBlocks - blocks in the buffers
   * @param blocksBufs - list of ByteString encoded varints
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeBuffers(final int numBlocks,
      final List<ByteString> blocksBufs) {
    // this doesn't actually copy the data
    return decodeBuffer(numBlocks, ByteString.copyFrom(blocksBufs));
  }

  /**
   * Prepare an instance to in-place decode the given list of Longs.  Note
   * it's much more efficient to decode ByteString buffers and only exists
   * for compatibility.
   * @param blocksList - list of longs; an empty list yields {@link #EMPTY}
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs decodeLongs(List<Long> blocksList) {
    return blocksList.isEmpty() ? EMPTY : new LongsDecoder(blocksList);
  }

  /**
   * Prepare an instance to encode the collection of replicas into an
   * efficient ByteString.
   * @param replicas - replicas to encode
   * @return BlockListAsLongs
   */
  public static BlockListAsLongs encode(
      final Collection<? extends Replica> replicas) {
    BlockListAsLongs.Builder builder = builder();
    for (Replica replica : replicas) {
      builder.add(replica);
    }
    return builder.build();
  }

  /**
   * Create a builder to which replicas can be added one at a time.
   * @return a new, empty Builder
   */
  public static Builder builder() {
    return new BlockListAsLongs.Builder();
  }

  /**
   * The number of blocks
   * @return - the number of blocks
   */
  abstract public int getNumberOfBlocks();

  /**
   * Very efficient encoding of the block report into a ByteString to avoid
   * the overhead of protobuf repeating fields.  Primitive repeating fields
   * require re-allocs of an {@code ArrayList<Long>} and the associated
   * (un)boxing overhead which puts pressure on GC.
   *
   * The structure of the buffer is as follows:
   * - each replica is represented by 4 longs:
   *   blockId, block length, genstamp, replica state
   *
   * @return ByteString encoded block report
   */
  abstract public ByteString getBlocksBuffer();

  /**
   * List of ByteStrings that encode this block report.  A buffer larger
   * than the chunk size (64K) is split into consecutive chunks of at most
   * that size; otherwise a single-element list is returned.
   *
   * @return ByteStrings
   */
  public List<ByteString> getBlocksBuffers() {
    final ByteString blocksBuf = getBlocksBuffer();
    final List<ByteString> buffers;
    final int size = blocksBuf.size();
    if (size <= CHUNK_SIZE) {
      buffers = Collections.singletonList(blocksBuf);
    } else {
      buffers = new ArrayList<>();
      for (int pos=0; pos < size; pos += CHUNK_SIZE) {
        // this doesn't actually copy the data
        buffers.add(blocksBuf.substring(pos, Math.min(pos+CHUNK_SIZE, size)));
      }
    }
    return buffers;
  }

  /**
   * Convert block report to old-style list of longs.  Only used to
   * re-encode the block report when the DN detects an older NN. This is
   * inefficient, but in practice a DN is unlikely to be upgraded first
   *
   * The structure of the array is as follows:
   * 0: the length of the finalized replica list;
   * 1: the length of the under-construction replica list;
   * - followed by finalized replica list where each replica is represented by
   *   3 longs: one for the blockId, one for the block length, and one for
   *   the generation stamp;
   * - followed by the invalid replica represented with three -1s;
   * - followed by the under-construction replica list where each replica is
   *   represented by 4 longs: three for the block id, length, generation
   *   stamp, and the fourth for the replica state.
   * @return list of longs
   */
  abstract public long[] getBlockListAsLongs();

  /**
   * Returns a singleton iterator over blocks in the block report.  The
   * decoders in this class return the same mutable replica object from
   * every call to {@code next()}, so do not add the returned blocks to a
   * collection.
   * @return Iterator
   */
  abstract public Iterator<BlockReportReplica> iterator();

  /**
   * Incrementally encodes replicas into the varint ByteString format
   * described in {@link BlockListAsLongs#getBlocksBuffer()}.
   */
  public static class Builder {
    private final ByteString.Output out;
    private final CodedOutputStream cos;
    private int numBlocks = 0;
    private int numFinalized = 0;

    Builder() {
      out = ByteString.newOutput(64*1024);
      cos = CodedOutputStream.newInstance(out);
    }

    /**
     * Append one replica to the report being built.
     * @param replica - replica whose id, bytes on disk, generation stamp
     *                  and state are encoded
     */
    public void add(Replica replica) {
      try {
        // zig-zag to reduce size of legacy blocks
        cos.writeSInt64NoTag(replica.getBlockId());
        cos.writeRawVarint64(replica.getBytesOnDisk());
        cos.writeRawVarint64(replica.getGenerationStamp());
        ReplicaState state = replica.getState();
        // although state is not a 64-bit value, using a long varint to
        // allow for future use of the upper bits
        cos.writeRawVarint64(state.getValue());
        if (state == ReplicaState.FINALIZED) {
          numFinalized++;
        }
        numBlocks++;
      } catch (IOException ioe) {
        // shouldn't happen, ByteString.Output doesn't throw IOE
        throw new IllegalStateException(ioe);
      }
    }

    /**
     * @return the number of replicas added so far
     */
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    /**
     * Flush the encoder and wrap everything added so far.
     * @return BlockListAsLongs backed by the encoded buffer
     */
    public BlockListAsLongs build() {
      try {
        cos.flush();
      } catch (IOException ioe) {
        // shouldn't happen, ByteString.Output doesn't throw IOE
        throw new IllegalStateException(ioe);
      }
      return new BufferDecoder(numBlocks, numFinalized, out.toByteString());
    }
  }

  // decode new-style ByteString buffer based block report
  private static class BufferDecoder extends BlockListAsLongs {
    // reserve upper bits for future use.  decoding masks off these bits to
    // allow compatibility for the current through future release that may
    // start using the bits
    private static final long NUM_BYTES_MASK = (-1L) >>> (64 - 48);
    private static final long REPLICA_STATE_MASK = (-1L) >>> (64 - 4);

    private final ByteString buffer;
    private final int numBlocks;
    // -1 means "not yet known"; computed on demand by getBlockListAsLongs()
    private int numFinalized;

    BufferDecoder(final int numBlocks, final ByteString buf) {
      this(numBlocks, -1, buf);
    }

    BufferDecoder(final int numBlocks, final int numFinalized,
        final ByteString buf) {
      this.numBlocks = numBlocks;
      this.numFinalized = numFinalized;
      this.buffer = buf;
    }

    @Override
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    @Override
    public ByteString getBlocksBuffer() {
      return buffer;
    }

    @Override
    public long[] getBlockListAsLongs() {
      // terribly inefficient but only occurs if server tries to transcode
      // an undecoded buffer into longs - ie. it will never happen but let's
      // handle it anyway
      if (numFinalized == -1) {
        int n = 0;
        for (Replica replica : this) {
          if (replica.getState() == ReplicaState.FINALIZED) {
            n++;
          }
        }
        numFinalized = n;
      }
      int numUc = numBlocks - numFinalized;
      // header (2) + finalized triples + delimiter triple + uc quadruples
      int size = 2 + 3*(numFinalized+1) + 4*(numUc);
      long[] longs = new long[size];
      longs[0] = numFinalized;
      longs[1] = numUc;

      // finalized entries are written from idx, under-construction entries
      // from ucIdx, so both sections fill in a single pass over the buffer
      int idx = 2;
      int ucIdx = idx + 3*numFinalized;
      // delimiter block
      longs[ucIdx++] = -1;
      longs[ucIdx++] = -1;
      longs[ucIdx++] = -1;

      for (BlockReportReplica block : this) {
        switch (block.getState()) {
          case FINALIZED: {
            longs[idx++] = block.getBlockId();
            longs[idx++] = block.getNumBytes();
            longs[idx++] = block.getGenerationStamp();
            break;
          }
          default: {
            longs[ucIdx++] = block.getBlockId();
            longs[ucIdx++] = block.getNumBytes();
            longs[ucIdx++] = block.getGenerationStamp();
            longs[ucIdx++] = block.getState().getValue();
            break;
          }
        }
      }
      return longs;
    }

    @Override
    public Iterator<BlockReportReplica> iterator() {
      return new Iterator<BlockReportReplica>() {
        // single replica instance that is overwritten by every next()
        final BlockReportReplica block = new BlockReportReplica();
        final CodedInputStream cis = buffer.newCodedInput();
        private int currentBlockIndex = 0;

        @Override
        public boolean hasNext() {
          return currentBlockIndex < numBlocks;
        }

        @Override
        public BlockReportReplica next() {
          currentBlockIndex++;
          try {
            // zig-zag to reduce size of legacy blocks and mask off bits
            // we don't (yet) understand
            block.setBlockId(cis.readSInt64());
            block.setNumBytes(cis.readRawVarint64() & NUM_BYTES_MASK);
            block.setGenerationStamp(cis.readRawVarint64());
            long state = cis.readRawVarint64() & REPLICA_STATE_MASK;
            block.setState(ReplicaState.getState((int)state));
          } catch (IOException e) {
            throw new IllegalStateException(e);
          }
          return block;
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }
  }

  // decode old style block report of longs
  private static class LongsDecoder extends BlockListAsLongs {
    // the report body, i.e. the input list without its two header longs
    private final List<Long> values;
    private final int finalizedBlocks;
    private final int numBlocks;

    // set the header
    LongsDecoder(List<Long> values) {
      this.values = values.subList(2, values.size());
      this.finalizedBlocks = values.get(0).intValue();
      this.numBlocks = finalizedBlocks + values.get(1).intValue();
    }

    @Override
    public int getNumberOfBlocks() {
      return numBlocks;
    }

    @Override
    public ByteString getBlocksBuffer() {
      Builder builder = builder();
      for (Replica replica : this) {
        builder.add(replica);
      }
      return builder.build().getBlocksBuffer();
    }

    @Override
    public long[] getBlockListAsLongs() {
      long[] longs = new long[2+values.size()];
      longs[0] = finalizedBlocks;
      longs[1] = numBlocks - finalizedBlocks;
      // values excludes the two header longs, so the body is copied in
      // after the header; indexing values by the array position would
      // clobber the header and run past the end of the list
      int idx = 2;
      for (long value : values) {
        longs[idx++] = value;
      }
      return longs;
    }

    @Override
    public Iterator<BlockReportReplica> iterator() {
      return new Iterator<BlockReportReplica>() {
        // single replica instance that is overwritten by every next()
        private final BlockReportReplica block = new BlockReportReplica();
        final Iterator<Long> iter = values.iterator();
        private int currentBlockIndex = 0;

        @Override
        public boolean hasNext() {
          return currentBlockIndex < numBlocks;
        }

        @Override
        public BlockReportReplica next() {
          if (currentBlockIndex == finalizedBlocks) {
            // verify the presence of the delimiter block
            readBlock();
            Preconditions.checkArgument(block.getBlockId() == -1 &&
                                        block.getNumBytes() == -1 &&
                                        block.getGenerationStamp() == -1,
                                        "Invalid delimiter block");
          }

          readBlock();
          // finalized replicas carry no state long; the rest have a 4th long
          if (currentBlockIndex++ < finalizedBlocks) {
            block.setState(ReplicaState.FINALIZED);
          } else {
            block.setState(ReplicaState.getState(iter.next().intValue()));
          }
          return block;
        }

        // consume the next three longs: block id, length, generation stamp
        private void readBlock() {
          block.setBlockId(iter.next());
          block.setNumBytes(iter.next());
          block.setGenerationStamp(iter.next());
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }
  }

  /**
   * A {@link Block} paired with a {@link ReplicaState}, as yielded when
   * iterating a block report.  Only the state and bytes-on-disk parts of
   * the {@link Replica} interface are supported; the remaining accessors
   * throw {@link UnsupportedOperationException}.
   */
  @InterfaceAudience.Private
  public static class BlockReportReplica extends Block implements Replica {
    private ReplicaState state;
    private BlockReportReplica() {
    }
    /**
     * Copy the given block.  The state is taken from the block when it is
     * itself a BlockReportReplica, otherwise it is set to FINALIZED.
     * @param block - block to copy
     */
    public BlockReportReplica(Block block) {
      super(block);
      if (block instanceof BlockReportReplica) {
        this.state = ((BlockReportReplica)block).getState();
      } else {
        this.state = ReplicaState.FINALIZED;
      }
    }
    public void setState(ReplicaState state) {
      this.state = state;
    }
    @Override
    public ReplicaState getState() {
      return state;
    }
    @Override
    public long getBytesOnDisk() {
      return getNumBytes();
    }
    @Override
    public long getVisibleLength() {
      throw new UnsupportedOperationException();
    }
    @Override
    public String getStorageUuid() {
      throw new UnsupportedOperationException();
    }
    @Override
    public boolean isOnTransientStorage() {
      throw new UnsupportedOperationException();
    }
    @Override
    public boolean equals(Object o) {
      return super.equals(o);
    }
    @Override
    public int hashCode() {
      return super.hashCode();
    }
  }
}