001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.protocol;
019
020import java.io.IOException;
021import java.util.ArrayList;
022import java.util.Collection;
023import java.util.Collections;
024import java.util.Iterator;
025import java.util.List;
026
027import org.apache.hadoop.classification.InterfaceAudience;
028import org.apache.hadoop.classification.InterfaceStability;
029import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
030import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
031import org.apache.hadoop.hdfs.server.datanode.Replica;
032import com.google.common.base.Preconditions;
033import com.google.protobuf.ByteString;
034import com.google.protobuf.CodedInputStream;
035import com.google.protobuf.CodedOutputStream;
036
037@InterfaceAudience.Private
038@InterfaceStability.Evolving
039public abstract class BlockListAsLongs implements Iterable<BlockReportReplica> {
040  private final static int CHUNK_SIZE = 64*1024; // 64K
041  private static long[] EMPTY_LONGS = new long[]{0, 0};
042
043  public static BlockListAsLongs EMPTY = new BlockListAsLongs() {
044    @Override
045    public int getNumberOfBlocks() {
046      return 0;
047    }
048    @Override
049    public ByteString getBlocksBuffer() {
050      return ByteString.EMPTY;
051    }
052    @Override
053    public long[] getBlockListAsLongs() {
054      return EMPTY_LONGS;
055    }
056    @Override
057    public Iterator<BlockReportReplica> iterator() {
058      return Collections.emptyIterator();
059    }
060  };
061
062  /**
063   * Prepare an instance to in-place decode the given ByteString buffer
064   * @param numBlocks - blocks in the buffer
065   * @param blocksBuf - ByteString encoded varints
066   * @return BlockListAsLongs
067   */
068  public static BlockListAsLongs decodeBuffer(final int numBlocks,
069      final ByteString blocksBuf) {
070    return new BufferDecoder(numBlocks, blocksBuf);
071  }
072
073  /**
074   * Prepare an instance to in-place decode the given ByteString buffers
075   * @param numBlocks - blocks in the buffers
076   * @param blocksBufs - list of ByteString encoded varints
077   * @return BlockListAsLongs
078   */
079  public static BlockListAsLongs decodeBuffers(final int numBlocks,
080      final List<ByteString> blocksBufs) {
081    // this doesn't actually copy the data
082    return decodeBuffer(numBlocks, ByteString.copyFrom(blocksBufs));
083  }
084
085  /**
086   * Prepare an instance to in-place decode the given list of Longs.  Note
087   * it's much more efficient to decode ByteString buffers and only exists
088   * for compatibility.
089   * @param blocksList - list of longs
090   * @return BlockListAsLongs
091   */
092  public static BlockListAsLongs decodeLongs(List<Long> blocksList) {
093    return blocksList.isEmpty() ? EMPTY : new LongsDecoder(blocksList);
094  }
095
096  /**
097   * Prepare an instance to encode the collection of replicas into an
098   * efficient ByteString.
099   * @param replicas - replicas to encode
100   * @return BlockListAsLongs
101   */
102  public static BlockListAsLongs encode(
103      final Collection<? extends Replica> replicas) {
104    BlockListAsLongs.Builder builder = builder();
105    for (Replica replica : replicas) {
106      builder.add(replica);
107    }
108    return builder.build();
109  }
110
111  public static Builder builder() {
112    return new BlockListAsLongs.Builder();
113  }
114
115  /**
116   * The number of blocks
117   * @return - the number of blocks
118   */
119  abstract public int getNumberOfBlocks();
120
121  /**
122   * Very efficient encoding of the block report into a ByteString to avoid
123   * the overhead of protobuf repeating fields.  Primitive repeating fields
124   * require re-allocs of an ArrayList<Long> and the associated (un)boxing
125   * overhead which puts pressure on GC.
126   * 
127   * The structure of the buffer is as follows:
128   * - each replica is represented by 4 longs:
129   *   blockId, block length, genstamp, replica state
130   *
131   * @return ByteString encoded block report
132   */
133  abstract public ByteString getBlocksBuffer();
134
135  /**
136   * List of ByteStrings that encode this block report
137   *
138   * @return ByteStrings
139   */
140  public List<ByteString> getBlocksBuffers() {
141    final ByteString blocksBuf = getBlocksBuffer();
142    final List<ByteString> buffers;
143    final int size = blocksBuf.size();
144    if (size <= CHUNK_SIZE) {
145      buffers = Collections.singletonList(blocksBuf);
146    } else {
147      buffers = new ArrayList<ByteString>();
148      for (int pos=0; pos < size; pos += CHUNK_SIZE) {
149        // this doesn't actually copy the data
150        buffers.add(blocksBuf.substring(pos, Math.min(pos+CHUNK_SIZE, size)));
151      }
152    }
153    return buffers;
154  }
155
156  /**
157   * Convert block report to old-style list of longs.  Only used to
158   * re-encode the block report when the DN detects an older NN. This is
159   * inefficient, but in practice a DN is unlikely to be upgraded first
160   * 
161   * The structure of the array is as follows:
162   * 0: the length of the finalized replica list;
163   * 1: the length of the under-construction replica list;
164   * - followed by finalized replica list where each replica is represented by
165   *   3 longs: one for the blockId, one for the block length, and one for
166   *   the generation stamp;
167   * - followed by the invalid replica represented with three -1s;
168   * - followed by the under-construction replica list where each replica is
169   *   represented by 4 longs: three for the block id, length, generation 
170   *   stamp, and the fourth for the replica state.
171   * @return list of longs
172   */
173  abstract public long[] getBlockListAsLongs();
174
175  /**
176   * Returns a singleton iterator over blocks in the block report.  Do not
177   * add the returned blocks to a collection.
178   * @return Iterator
179   */
180  abstract public Iterator<BlockReportReplica> iterator();
181
182  public static class Builder {
183    private final ByteString.Output out;
184    private final CodedOutputStream cos;
185    private int numBlocks = 0;
186    private int numFinalized = 0;
187
188    Builder() {
189      out = ByteString.newOutput(64*1024);
190      cos = CodedOutputStream.newInstance(out);
191    }
192
193    public void add(Replica replica) {
194      try {
195        // zig-zag to reduce size of legacy blocks
196        cos.writeSInt64NoTag(replica.getBlockId());
197        cos.writeRawVarint64(replica.getBytesOnDisk());
198        cos.writeRawVarint64(replica.getGenerationStamp());
199        ReplicaState state = replica.getState();
200        // although state is not a 64-bit value, using a long varint to
201        // allow for future use of the upper bits
202        cos.writeRawVarint64(state.getValue());
203        if (state == ReplicaState.FINALIZED) {
204          numFinalized++;
205        }
206        numBlocks++;
207      } catch (IOException ioe) {
208        // shouldn't happen, ByteString.Output doesn't throw IOE
209        throw new IllegalStateException(ioe);
210      }
211    }
212
213    public int getNumberOfBlocks() {
214      return numBlocks;
215    }
216    
217    public BlockListAsLongs build() {
218      try {
219        cos.flush();
220      } catch (IOException ioe) {
221        // shouldn't happen, ByteString.Output doesn't throw IOE
222        throw new IllegalStateException(ioe);
223      }
224      return new BufferDecoder(numBlocks, numFinalized, out.toByteString());
225    }
226  }
227
228  // decode new-style ByteString buffer based block report
229  private static class BufferDecoder extends BlockListAsLongs {
230    // reserve upper bits for future use.  decoding masks off these bits to
231    // allow compatibility for the current through future release that may
232    // start using the bits
233    private static long NUM_BYTES_MASK = (-1L) >>> (64 - 48);
234    private static long REPLICA_STATE_MASK = (-1L) >>> (64 - 4);
235
236    private final ByteString buffer;
237    private final int numBlocks;
238    private int numFinalized;
239
240    BufferDecoder(final int numBlocks, final ByteString buf) {
241      this(numBlocks, -1, buf);
242    }
243
244    BufferDecoder(final int numBlocks, final int numFinalized,
245        final ByteString buf) {
246      this.numBlocks = numBlocks;
247      this.numFinalized = numFinalized;
248      this.buffer = buf;
249    }
250
251    @Override
252    public int getNumberOfBlocks() {
253      return numBlocks;
254    }
255
256    @Override
257    public ByteString getBlocksBuffer() {
258      return buffer;
259    }
260
261    @Override
262    public long[] getBlockListAsLongs() {
263      // terribly inefficient but only occurs if server tries to transcode
264      // an undecoded buffer into longs - ie. it will never happen but let's
265      // handle it anyway
266      if (numFinalized == -1) {
267        int n = 0;
268        for (Replica replica : this) {
269          if (replica.getState() == ReplicaState.FINALIZED) {
270            n++;
271          }
272        }
273        numFinalized = n;
274      }
275      int numUc = numBlocks - numFinalized;
276      int size = 2 + 3*(numFinalized+1) + 4*(numUc);
277      long[] longs = new long[size];
278      longs[0] = numFinalized;
279      longs[1] = numUc;
280
281      int idx = 2;
282      int ucIdx = idx + 3*numFinalized;
283      // delimiter block
284      longs[ucIdx++] = -1;
285      longs[ucIdx++] = -1;
286      longs[ucIdx++] = -1;
287
288      for (BlockReportReplica block : this) {
289        switch (block.getState()) {
290          case FINALIZED: {
291            longs[idx++] = block.getBlockId();
292            longs[idx++] = block.getNumBytes();
293            longs[idx++] = block.getGenerationStamp();
294            break;
295          }
296          default: {
297            longs[ucIdx++] = block.getBlockId();
298            longs[ucIdx++] = block.getNumBytes();
299            longs[ucIdx++] = block.getGenerationStamp();
300            longs[ucIdx++] = block.getState().getValue();
301            break;
302          }
303        }
304      }
305      return longs;
306    }
307
308    @Override
309    public Iterator<BlockReportReplica> iterator() {
310      return new Iterator<BlockReportReplica>() {
311        final BlockReportReplica block = new BlockReportReplica();
312        final CodedInputStream cis = buffer.newCodedInput();
313        private int currentBlockIndex = 0;
314
315        @Override
316        public boolean hasNext() {
317          return currentBlockIndex < numBlocks;
318        }
319
320        @Override
321        public BlockReportReplica next() {
322          currentBlockIndex++;
323          try {
324            // zig-zag to reduce size of legacy blocks and mask off bits
325            // we don't (yet) understand
326            block.setBlockId(cis.readSInt64());
327            block.setNumBytes(cis.readRawVarint64() & NUM_BYTES_MASK);
328            block.setGenerationStamp(cis.readRawVarint64());
329            long state = cis.readRawVarint64() & REPLICA_STATE_MASK;
330            block.setState(ReplicaState.getState((int)state));
331          } catch (IOException e) {
332            throw new IllegalStateException(e);
333          }
334          return block;
335        }
336
337        @Override
338        public void remove() {
339          throw new UnsupportedOperationException();
340        }
341      };
342    }
343  }
344
345  // decode old style block report of longs
346  private static class LongsDecoder extends BlockListAsLongs {
347    private final List<Long> values;
348    private final int finalizedBlocks;
349    private final int numBlocks;
350
351    // set the header
352    LongsDecoder(List<Long> values) {
353      this.values = values.subList(2, values.size());
354      this.finalizedBlocks = values.get(0).intValue();
355      this.numBlocks = finalizedBlocks + values.get(1).intValue();
356    }
357
358    @Override
359    public int getNumberOfBlocks() {
360      return numBlocks;
361    }
362
363    @Override
364    public ByteString getBlocksBuffer() {
365      Builder builder = builder();
366      for (Replica replica : this) {
367        builder.add(replica);
368      }
369      return builder.build().getBlocksBuffer();
370    }
371
372    @Override
373    public long[] getBlockListAsLongs() {
374      long[] longs = new long[2+values.size()];
375      longs[0] = finalizedBlocks;
376      longs[1] = numBlocks - finalizedBlocks;
377      for (int i=0; i < longs.length; i++) {
378        longs[i] = values.get(i);
379      }
380      return longs;
381    }
382
383    @Override
384    public Iterator<BlockReportReplica> iterator() {
385      return new Iterator<BlockReportReplica>() {
386        private final BlockReportReplica block = new BlockReportReplica();
387        final Iterator<Long> iter = values.iterator();
388        private int currentBlockIndex = 0;
389
390        @Override
391        public boolean hasNext() {
392          return currentBlockIndex < numBlocks;
393        }
394
395        @Override
396        public BlockReportReplica next() {
397          if (currentBlockIndex == finalizedBlocks) {
398            // verify the presence of the delimiter block
399            readBlock();
400            Preconditions.checkArgument(block.getBlockId() == -1 &&
401                                        block.getNumBytes() == -1 &&
402                                        block.getGenerationStamp() == -1,
403                                        "Invalid delimiter block");
404          }
405
406          readBlock();
407          if (currentBlockIndex++ < finalizedBlocks) {
408            block.setState(ReplicaState.FINALIZED);
409          } else {
410            block.setState(ReplicaState.getState(iter.next().intValue()));
411          }
412          return block;
413        }
414
415        private void readBlock() {
416          block.setBlockId(iter.next());
417          block.setNumBytes(iter.next());
418          block.setGenerationStamp(iter.next());
419        }
420
421        @Override
422        public void remove() {
423          throw new UnsupportedOperationException();
424        }
425      };
426    }
427  }
428  
429  @InterfaceAudience.Private
430  public static class BlockReportReplica extends Block implements Replica {
431    private ReplicaState state;
432    private BlockReportReplica() {
433    }
434    public BlockReportReplica(Block block) {
435      super(block);
436      if (block instanceof BlockReportReplica) {
437        this.state = ((BlockReportReplica)block).getState();
438      } else {
439        this.state = ReplicaState.FINALIZED;
440      }
441    }
442    public void setState(ReplicaState state) {
443      this.state = state;
444    }
445    @Override
446    public ReplicaState getState() {
447      return state;
448    }
449    @Override
450    public long getBytesOnDisk() {
451      return getNumBytes();
452    }
453    @Override
454    public long getVisibleLength() {
455      throw new UnsupportedOperationException();
456    }
457    @Override
458    public String getStorageUuid() {
459      throw new UnsupportedOperationException();
460    }
461    @Override
462    public boolean isOnTransientStorage() {
463      throw new UnsupportedOperationException();
464    }
465    @Override
466    public boolean equals(Object o) {
467      return super.equals(o);
468    }
469    @Override
470    public int hashCode() {
471      return super.hashCode();
472    }
473  }
474}