001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019
020
021import java.io.EOFException;
022import java.io.File;
023import java.io.FileDescriptor;
024import java.io.FileNotFoundException;
025import java.io.IOException;
026import java.io.InputStream;
027import java.util.Collection;
028import java.util.List;
029import java.util.Map;
030import java.util.Set;
031
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.fs.StorageType;
035import org.apache.hadoop.hdfs.DFSConfigKeys;
036import org.apache.hadoop.hdfs.protocol.Block;
037import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
038import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
039import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
040import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
041import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
042import org.apache.hadoop.hdfs.server.datanode.DataNode;
043import org.apache.hadoop.hdfs.server.datanode.DataStorage;
044import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
045import org.apache.hadoop.hdfs.server.datanode.Replica;
046import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface;
047import org.apache.hadoop.hdfs.server.datanode.ReplicaHandler;
048import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
049import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
050import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
051import org.apache.hadoop.hdfs.server.datanode.UnexpectedReplicaStateException;
052import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory;
053import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
054import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
055import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
056import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
057import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
058import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
059import org.apache.hadoop.hdfs.server.protocol.StorageReport;
060import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
061import org.apache.hadoop.util.DiskChecker.DiskErrorException;
062import org.apache.hadoop.util.ReflectionUtils;
063
064/**
065 * This is a service provider interface for the underlying storage that
066 * stores replicas for a data node.
067 * The default implementation stores replicas on local drives. 
068 */
069@InterfaceAudience.Private
070public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
071  /**
072   * A factory for creating {@link FsDatasetSpi} objects.
073   */
074  public static abstract class Factory<D extends FsDatasetSpi<?>> {
075    /** @return the configured factory. */
076    public static Factory<?> getFactory(Configuration conf) {
077      @SuppressWarnings("rawtypes")
078      final Class<? extends Factory> clazz = conf.getClass(
079          DFSConfigKeys.DFS_DATANODE_FSDATASET_FACTORY_KEY,
080          FsDatasetFactory.class,
081          Factory.class);
082      return ReflectionUtils.newInstance(clazz, conf);
083    }
084
085    /** Create a new object. */
086    public abstract D newInstance(DataNode datanode, DataStorage storage,
087        Configuration conf) throws IOException;
088
089    /** Does the factory create simulated objects? */
090    public boolean isSimulated() {
091      return false;
092    }
093  }
094
095  /** @return a list of volumes. */
096  public List<V> getVolumes();
097
098  /**
099   * Add a new volume to the FsDataset.<p/>
100   *
101   * If the FSDataset supports block scanning, this function registers
102   * the new volume with the block scanner.
103   *
104   * @param location      The storage location for the new volume.
105   * @param nsInfos       Namespace information for the new volume.
106   */
107  public void addVolume(
108      final StorageLocation location,
109      final List<NamespaceInfo> nsInfos) throws IOException;
110
111  /**
112   * Removes a collection of volumes from FsDataset.
113   *
114   * If the FSDataset supports block scanning, this function removes
115   * the volumes from the block scanner.
116   *
117   * @param volumes  The paths of the volumes to be removed.
118   * @param clearFailure set true to clear the failure information about the
119   *                     volumes.
120   */
121  public void removeVolumes(Set<File> volumes, boolean clearFailure);
122
123  /** @return a storage with the given storage ID */
124  public DatanodeStorage getStorage(final String storageUuid);
125
126  /** @return one or more storage reports for attached volumes. */
127  public StorageReport[] getStorageReports(String bpid)
128      throws IOException;
129
130  /** @return the volume that contains a replica of the block. */
131  public V getVolume(ExtendedBlock b);
132
133  /** @return a volume information map (name => info). */
134  public Map<String, Object> getVolumeInfoMap();
135
136  /**
137   * Returns info about volume failures.
138   *
139   * @return info about volume failures, possibly null
140   */
141  VolumeFailureSummary getVolumeFailureSummary();
142
143  /** @return a list of finalized blocks for the given block pool. */
144  public List<FinalizedReplica> getFinalizedBlocks(String bpid);
145
146  /** @return a list of finalized blocks for the given block pool. */
147  public List<FinalizedReplica> getFinalizedBlocksOnPersistentStorage(String bpid);
148
149  /**
150   * Check whether the in-memory block record matches the block on the disk,
151   * and, in case that they are not matched, update the record or mark it
152   * as corrupted.
153   */
154  public void checkAndUpdate(String bpid, long blockId, File diskFile,
155      File diskMetaFile, FsVolumeSpi vol) throws IOException;
156
157  /**
158   * @param b - the block
159   * @return a stream if the meta-data of the block exists;
160   *         otherwise, return null.
161   * @throws IOException
162   */
163  public LengthInputStream getMetaDataInputStream(ExtendedBlock b
164      ) throws IOException;
165
166  /**
167   * Returns the specified block's on-disk length (excluding metadata)
168   * @return   the specified block's on-disk length (excluding metadta)
169   * @throws IOException on error
170   */
171  public long getLength(ExtendedBlock b) throws IOException;
172
173  /**
174   * Get reference to the replica meta info in the replicasMap. 
175   * To be called from methods that are synchronized on {@link FSDataset}
176   * @return replica from the replicas map
177   */
178  @Deprecated
179  public Replica getReplica(String bpid, long blockId);
180
181  /**
182   * @return replica meta information
183   */
184  public String getReplicaString(String bpid, long blockId);
185
186  /**
187   * @return the generation stamp stored with the block.
188   */
189  public Block getStoredBlock(String bpid, long blkid) throws IOException;
190  
191  /**
192   * Returns an input stream at specified offset of the specified block
193   * @param b block
194   * @param seekOffset offset with in the block to seek to
195   * @return an input stream to read the contents of the specified block,
196   *  starting at the offset
197   * @throws IOException
198   */
199  public InputStream getBlockInputStream(ExtendedBlock b, long seekOffset)
200            throws IOException;
201
202  /**
203   * Returns an input stream at specified offset of the specified block
204   * The block is still in the tmp directory and is not finalized
205   * @return an input stream to read the contents of the specified block,
206   *  starting at the offset
207   * @throws IOException
208   */
209  public ReplicaInputStreams getTmpInputStreams(ExtendedBlock b, long blkoff,
210      long ckoff) throws IOException;
211
212  /**
213   * Creates a temporary replica and returns the meta information of the replica
214   * @param b block
215   * @param isTransfer whether for transfer
216   *
217   * @return the meta info of the replica which is being written to
218   * @throws IOException if an error occurs
219   */
220  public ReplicaHandler createTemporary(StorageType storageType,
221      ExtendedBlock b, boolean isTransfer) throws IOException;
222
223  /**
224   * Creates a RBW replica and returns the meta info of the replica
225   * 
226   * @param b block
227   * @return the meta info of the replica which is being written to
228   * @throws IOException if an error occurs
229   */
230  public ReplicaHandler createRbw(StorageType storageType,
231      ExtendedBlock b, boolean allowLazyPersist) throws IOException;
232
233  /**
234   * Recovers a RBW replica and returns the meta info of the replica
235   * 
236   * @param b block
237   * @param newGS the new generation stamp for the replica
238   * @param minBytesRcvd the minimum number of bytes that the replica could have
239   * @param maxBytesRcvd the maximum number of bytes that the replica could have
240   * @return the meta info of the replica which is being written to
241   * @throws IOException if an error occurs
242   */
243  public ReplicaHandler recoverRbw(ExtendedBlock b,
244      long newGS, long minBytesRcvd, long maxBytesRcvd) throws IOException;
245
246  /**
247   * Covert a temporary replica to a RBW.
248   * @param temporary the temporary replica being converted
249   * @return the result RBW
250   */
251  public ReplicaInPipelineInterface convertTemporaryToRbw(
252      ExtendedBlock temporary) throws IOException;
253
254  /**
255   * Append to a finalized replica and returns the meta info of the replica
256   * 
257   * @param b block
258   * @param newGS the new generation stamp for the replica
259   * @param expectedBlockLen the number of bytes the replica is expected to have
260   * @return the meata info of the replica which is being written to
261   * @throws IOException
262   */
263  public ReplicaHandler append(ExtendedBlock b, long newGS,
264      long expectedBlockLen) throws IOException;
265
266  /**
267   * Recover a failed append to a finalized replica
268   * and returns the meta info of the replica
269   * 
270   * @param b block
271   * @param newGS the new generation stamp for the replica
272   * @param expectedBlockLen the number of bytes the replica is expected to have
273   * @return the meta info of the replica which is being written to
274   * @throws IOException
275   */
276  public ReplicaHandler recoverAppend(
277      ExtendedBlock b, long newGS, long expectedBlockLen) throws IOException;
278  
279  /**
280   * Recover a failed pipeline close
281   * It bumps the replica's generation stamp and finalize it if RBW replica
282   * 
283   * @param b block
284   * @param newGS the new generation stamp for the replica
285   * @param expectedBlockLen the number of bytes the replica is expected to have
286   * @return the storage uuid of the replica.
287   * @throws IOException
288   */
289  Replica recoverClose(ExtendedBlock b, long newGS, long expectedBlockLen
290      ) throws IOException;
291  
292  /**
293   * Finalizes the block previously opened for writing using writeToBlock.
294   * The block size is what is in the parameter b and it must match the amount
295   *  of data written
296   * @param block Block to be finalized
297   * @param fsyncDir whether to sync the directory changes to durable device.
298   * @throws IOException
299   * @throws ReplicaNotFoundException if the replica can not be found when the
300   * block is been finalized. For instance, the block resides on an HDFS volume
301   * that has been removed.
302   */
303  void finalizeBlock(ExtendedBlock b, boolean fsyncDir) throws IOException;
304
305  /**
306   * Unfinalizes the block previously opened for writing using writeToBlock.
307   * The temporary file associated with this block is deleted.
308   * @throws IOException
309   */
310  public void unfinalizeBlock(ExtendedBlock b) throws IOException;
311
312  /**
313   * Returns one block report per volume.
314   * @param bpid Block Pool Id
315   * @return - a map of DatanodeStorage to block report for the volume.
316   */
317  public Map<DatanodeStorage, BlockListAsLongs> getBlockReports(String bpid);
318
319  /**
320   * Returns the cache report - the full list of cached block IDs of a
321   * block pool.
322   * @param   bpid Block Pool Id
323   * @return  the cache report - the full list of cached block IDs.
324   */
325  public List<Long> getCacheReport(String bpid);
326
327  /** Does the dataset contain the block? */
328  public boolean contains(ExtendedBlock block);
329
330  /**
331   * Check if a block is valid.
332   *
333   * @param b           The block to check.
334   * @param minLength   The minimum length that the block must have.  May be 0.
335   * @param state       If this is null, it is ignored.  If it is non-null, we
336   *                        will check that the replica has this state.
337   *
338   * @throws ReplicaNotFoundException          If the replica is not found
339   *
340   * @throws UnexpectedReplicaStateException   If the replica is not in the 
341   *                                             expected state.
342   * @throws FileNotFoundException             If the block file is not found or there 
343   *                                              was an error locating it.
344   * @throws EOFException                      If the replica length is too short.
345   * 
346   * @throws IOException                       May be thrown from the methods called. 
347   */
348  public void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
349      throws ReplicaNotFoundException, UnexpectedReplicaStateException,
350      FileNotFoundException, EOFException, IOException;
351      
352  
353  /**
354   * Is the block valid?
355   * @return - true if the specified block is valid
356   */
357  public boolean isValidBlock(ExtendedBlock b);
358
359  /**
360   * Is the block a valid RBW?
361   * @return - true if the specified block is a valid RBW
362   */
363  public boolean isValidRbw(ExtendedBlock b);
364
365  /**
366   * Invalidates the specified blocks
367   * @param bpid Block pool Id
368   * @param invalidBlks - the blocks to be invalidated
369   * @throws IOException
370   */
371  public void invalidate(String bpid, Block invalidBlks[]) throws IOException;
372
373  /**
374   * Caches the specified blocks
375   * @param bpid Block pool id
376   * @param blockIds - block ids to cache
377   */
378  public void cache(String bpid, long[] blockIds);
379
380  /**
381   * Uncaches the specified blocks
382   * @param bpid Block pool id
383   * @param blockIds - blocks ids to uncache
384   */
385  public void uncache(String bpid, long[] blockIds);
386
387  /**
388   * Determine if the specified block is cached.
389   * @param bpid Block pool id
390   * @param blockIds - block id
391   * @return true if the block is cached
392   */
393  public boolean isCached(String bpid, long blockId);
394
395    /**
396     * Check if all the data directories are healthy
397     * @return A set of unhealthy data directories.
398     */
399  public Set<File> checkDataDir();
400
401  /**
402   * Shutdown the FSDataset
403   */
404  public void shutdown();
405
406  /**
407   * Sets the file pointer of the checksum stream so that the last checksum
408   * will be overwritten
409   * @param b block
410   * @param outs The streams for the data file and checksum file
411   * @param checksumSize number of bytes each checksum has
412   * @throws IOException
413   */
414  public void adjustCrcChannelPosition(ExtendedBlock b,
415      ReplicaOutputStreams outs, int checksumSize) throws IOException;
416
417  /**
418   * Checks how many valid storage volumes there are in the DataNode.
419   * @return true if more than the minimum number of valid volumes are left 
420   * in the FSDataSet.
421   */
422  public boolean hasEnoughResource();
423
424  /**
425   * Get visible length of the specified replica.
426   */
427  long getReplicaVisibleLength(final ExtendedBlock block) throws IOException;
428
429  /**
430   * Initialize a replica recovery.
431   * @return actual state of the replica on this data-node or 
432   * null if data-node does not have the replica.
433   */
434  public ReplicaRecoveryInfo initReplicaRecovery(RecoveringBlock rBlock
435      ) throws IOException;
436
437  /**
438   * Update replica's generation stamp and length and finalize it.
439   * @return the ID of storage that stores the block
440   */
441  Replica updateReplicaUnderRecovery(ExtendedBlock oldBlock,
442      long recoveryId, long newBlockId, long newLength) throws IOException;
443
444  /**
445   * add new block pool ID
446   * @param bpid Block pool Id
447   * @param conf Configuration
448   */
449  public void addBlockPool(String bpid, Configuration conf) throws IOException;
450  
451  /**
452   * Shutdown and remove the block pool from underlying storage.
453   * @param bpid Block pool Id to be removed
454   */
455  public void shutdownBlockPool(String bpid) ;
456  
457  /**
458   * Deletes the block pool directories. If force is false, directories are 
459   * deleted only if no block files exist for the block pool. If force 
460   * is true entire directory for the blockpool is deleted along with its
461   * contents.
462   * @param bpid BlockPool Id to be deleted.
463   * @param force If force is false, directories are deleted only if no
464   *        block files exist for the block pool, otherwise entire 
465   *        directory for the blockpool is deleted along with its contents.
466   * @throws IOException
467   */
468  public void deleteBlockPool(String bpid, boolean force) throws IOException;
469  
470  /**
471   * Get {@link BlockLocalPathInfo} for the given block.
472   */
473  public BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b
474      ) throws IOException;
475
476  /**
477   * Get a {@link HdfsBlocksMetadata} corresponding to the list of blocks in 
478   * <code>blocks</code>.
479   * 
480   * @param bpid pool to query
481   * @param blockIds List of block ids for which to return metadata
482   * @return metadata Metadata for the list of blocks
483   * @throws IOException
484   */
485  public HdfsBlocksMetadata getHdfsBlocksMetadata(String bpid,
486      long[] blockIds) throws IOException;
487
488  /**
489   * Enable 'trash' for the given dataset. When trash is enabled, files are
490   * moved to a separate trash directory instead of being deleted immediately.
491   * This can be useful for example during rolling upgrades.
492   */
493  public void enableTrash(String bpid);
494
495  /**
496   * Clear trash
497   */
498  public void clearTrash(String bpid);
499
500  /**
501   * @return true when trash is enabled
502   */
503  public boolean trashEnabled(String bpid);
504
505  /**
506   * Create a marker file indicating that a rolling upgrade is in progress.
507   */
508  public void setRollingUpgradeMarker(String bpid) throws IOException;
509
510  /**
511   * Delete the rolling upgrade marker file if it exists.
512   * @param bpid
513   */
514  public void clearRollingUpgradeMarker(String bpid) throws IOException;
515
516  /**
517   * submit a sync_file_range request to AsyncDiskService
518   */
519  public void submitBackgroundSyncFileRangeRequest(final ExtendedBlock block,
520      final FileDescriptor fd, final long offset, final long nbytes,
521      final int flags);
522
523  /**
524   * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task end
525   */
526   public void onCompleteLazyPersist(String bpId, long blockId,
527      long creationTime, File[] savedFiles, V targetVolume);
528
529   /**
530    * Callback from RamDiskAsyncLazyPersistService upon async lazy persist task fail
531    */
532   public void onFailLazyPersist(String bpId, long blockId);
533
534    /**
535     * Move block from one storage to another storage
536     */
537    public ReplicaInfo moveBlockAcrossStorage(final ExtendedBlock block,
538        StorageType targetStorageType) throws IOException;
539
540  /**
541   * Set a block to be pinned on this datanode so that it cannot be moved
542   * by Balancer/Mover.
543   *
544   * It is a no-op when dfs.datanode.block-pinning.enabled is set to false.
545   */
546  public void setPinning(ExtendedBlock block) throws IOException;
547
548  /**
549   * Check whether the block was pinned
550   */
551  public boolean getPinning(ExtendedBlock block) throws IOException;
552  
553  /**
554   * Confirm whether the block is deleting
555   */
556  public boolean isDeletingBlock(String bpid, long blockId);
557}