001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.Time.monotonicNow;
021
022import java.io.DataInput;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileNotFoundException;
028import java.io.FileOutputStream;
029import java.io.IOException;
030import java.security.DigestInputStream;
031import java.security.DigestOutputStream;
032import java.security.MessageDigest;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Collection;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Map;
039import java.util.TreeMap;
040
041import org.apache.commons.logging.Log;
042import org.apache.hadoop.classification.InterfaceAudience;
043import org.apache.hadoop.classification.InterfaceStability;
044import org.apache.hadoop.conf.Configuration;
045import org.apache.hadoop.fs.FileSystem;
046import org.apache.hadoop.fs.Path;
047import org.apache.hadoop.fs.PathIsNotDirectoryException;
048import org.apache.hadoop.fs.UnresolvedLinkException;
049import org.apache.hadoop.fs.permission.PermissionStatus;
050import org.apache.hadoop.hdfs.DFSUtil;
051import org.apache.hadoop.hdfs.protocol.HdfsConstants;
052import org.apache.hadoop.hdfs.protocol.LayoutFlags;
053import org.apache.hadoop.hdfs.protocol.LayoutVersion;
054import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
055import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
056import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
057import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
058import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
059import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
060import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
061import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
062import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
065import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
068import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
069import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
070import org.apache.hadoop.hdfs.util.ReadOnlyList;
071import org.apache.hadoop.io.IOUtils;
072import org.apache.hadoop.io.MD5Hash;
073import org.apache.hadoop.io.Text;
074import org.apache.hadoop.util.StringUtils;
075
076import com.google.common.annotations.VisibleForTesting;
077import com.google.common.base.Preconditions;
078
079/**
080 * Contains inner classes for reading or writing the on-disk format for
081 * FSImages.
082 *
083 * In particular, the format of the FSImage looks like:
084 * <pre>
085 * FSImage {
086 *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
087 *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
090 *   numOfSnapshottableDirs: int,
091 *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
092 * }
093 *
094 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
095 *   INodeInfo of root, numberOfChildren of root: int
096 *   [list of INodeInfo of root's children],
097 *   [list of INodeDirectoryInfo of root's directory children]
098 * }
099 *
 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported) {
101 *   [list of INodeInfo of INodes in topological order]
102 * }
103 *
104 * INodeInfo {
105 *   {
106 *     localName: short + byte[]
107 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
108 *   or
109 *   {
110 *     fullPath: byte[]
111 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
112 *   replicationFactor: short, modificationTime: long,
113 *   accessTime: long, preferredBlockSize: long,
114 *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
115 *   {
116 *     nsQuota: long, dsQuota: long,
117 *     {
118 *       isINodeSnapshottable: byte,
119 *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
120 *     } (when {@link Feature#SNAPSHOT} is supported),
121 *     fsPermission: short, PermissionStatus
122 *   } for INodeDirectory
123 *   or
124 *   {
125 *     symlinkString, fsPermission: short, PermissionStatus
126 *   } for INodeSymlink
127 *   or
128 *   {
129 *     [list of BlockInfo]
130 *     [list of FileDiff]
131 *     {
132 *       isINodeFileUnderConstructionSnapshot: byte,
133 *       {clientName: short + byte[], clientMachine: short + byte[]} (when
134 *       isINodeFileUnderConstructionSnapshot is true),
135 *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
136 *     fsPermission: short, PermissionStatus
137 *   } for INodeFile
138 * }
139 *
140 * INodeDirectoryInfo {
141 *   fullPath of the directory: short + byte[],
142 *   numberOfChildren: int, [list of INodeInfo of children INode],
143 *   {
144 *     numberOfSnapshots: int,
145 *     [list of Snapshot] (when NumberOfSnapshots is positive),
146 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (when numberOfDirectoryDiffs is positive),
148 *     number of children that are directories,
149 *     [list of INodeDirectoryInfo of the directory children] (includes
150 *     snapshot copies of deleted sub-directories)
151 *   } (when {@link Feature#SNAPSHOT} is supported),
152 * }
153 *
154 * Snapshot {
155 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
156 *   the name of the snapshot)
157 * }
158 *
159 * DirectoryDiff {
160 *   full path of the root of the associated Snapshot: short + byte[],
161 *   childrenSize: int,
162 *   isSnapshotRoot: byte,
163 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
164 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
165 * }
166 *
167 * Diff {
168 *   createdListSize: int, [Local name of INode in created list],
169 *   deletedListSize: int, [INode in deleted list: INodeInfo]
170 * }
171 *
172 * FileDiff {
173 *   full path of the root of the associated Snapshot: short + byte[],
174 *   fileSize: long,
175 *   snapshotINodeIsNotNull: byte,
176 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
177 * }
178 * </pre>
179 */
180@InterfaceAudience.Private
181@InterfaceStability.Evolving
182public class FSImageFormat {
  // Shared with FSImage so load/save log messages appear under one logger.
  private static final Log LOG = FSImage.LOG;

  // Static-only class
  private FSImageFormat() {}
187
  /**
   * Common view over the legacy and protobuf image loaders: exposes the
   * digest and last transaction ID of the image after it has been loaded.
   */
  interface AbstractLoader {
    /** @return MD5 checksum of the image file that was loaded. */
    MD5Hash getLoadedImageMd5();
    /** @return transaction ID of the last edit covered by the loaded image. */
    long getLoadedImageTxId();
  }
192
193  static class LoaderDelegator implements AbstractLoader {
194    private AbstractLoader impl;
195    private final Configuration conf;
196    private final FSNamesystem fsn;
197
198    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
199      this.conf = conf;
200      this.fsn = fsn;
201    }
202
203    @Override
204    public MD5Hash getLoadedImageMd5() {
205      return impl.getLoadedImageMd5();
206    }
207
208    @Override
209    public long getLoadedImageTxId() {
210      return impl.getLoadedImageTxId();
211    }
212
213    public void load(File file, boolean requireSameLayoutVersion)
214        throws IOException {
215      Preconditions.checkState(impl == null, "Image already loaded!");
216
217      FileInputStream is = null;
218      try {
219        is = new FileInputStream(file);
220        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
221        IOUtils.readFully(is, magic, 0, magic.length);
222        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
223          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
224              conf, fsn, requireSameLayoutVersion);
225          impl = loader;
226          loader.load(file);
227        } else {
228          Loader loader = new Loader(conf, fsn);
229          impl = loader;
230          loader.load(file);
231        }
232      } finally {
233        IOUtils.cleanup(LOG, is);
234      }
235    }
236  }
237
238  /**
239   * Construct a loader class to load the image. It chooses the loader based on
240   * the layout version.
241   */
242  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
243    return new LoaderDelegator(conf, fsn);
244  }
245
246  /**
247   * A one-shot class responsible for loading an image. The load() function
248   * should be called once, after which the getter methods may be used to retrieve
249   * information about the image that was loaded, if loading was successful.
250   */
251  public static class Loader implements AbstractLoader {
    /** Configuration, consulted when reading the compression header. */
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    // Snapshot id -> Snapshot; populated by load() only when the image's
    // layout version supports snapshots, otherwise stays null.
    private Map<Integer, Snapshot> snapshotMap = null;
    // Tracks reference inodes so a shared subtree is processed only once
    // (consulted via toProcessSubtree in loadDirectoryWithSnapshot).
    private final ReferenceMap referenceMap = new ReferenceMap();

    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
271
    /**
     * Return the MD5 checksum of the image that has been loaded.
     * The digest is computed over the raw file bytes during load().
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public MD5Hash getLoadedImageMd5() {
      checkLoaded();
      return imgDigest;
    }
281
    /**
     * Return the transaction ID of the last edit represented by the image
     * (0 for layouts that predate stored transaction IDs).
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public long getLoadedImageTxId() {
      checkLoaded();
      return imgTxId;
    }
287
288    /**
289     * Throw IllegalStateException if load() has not yet been called.
290     */
291    private void checkLoaded() {
292      if (!loaded) {
293        throw new IllegalStateException("Image not yet loaded!");
294      }
295    }
296
297    /**
298     * Throw IllegalStateException if load() has already been called.
299     */
300    private void checkNotLoaded() {
301      if (loaded) {
302        throw new IllegalStateException("Image already loaded!");
303      }
304    }
305
    /**
     * Load a legacy (pre-protobuf) fsimage file into the namesystem.
     * Reads the fixed header (layout version, namespace id, generation
     * stamps, transaction id, last inode id, snapshot state), then the
     * inode tree, files under construction, secret-manager state and
     * cache-manager state, and finally verifies the whole file was consumed.
     * The field order below IS the on-disk format; do not reorder reads.
     *
     * @param curFile local image file to load
     * @throws IOException if the file is truncated, corrupt, or was written
     *         with a layout version other than the current one
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = monotonicNow();

      //
      // Load in bits
      //
      // The digest stream computes the file's MD5 as a side effect of
      // reading; the result is published via getLoadedImageMd5().
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.getBlockIdManager().setGenerationStampV1(genstamp);

        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.getBlockIdManager().setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.getBlockIdManager().setLastAllocatedBlockId(maxSequentialBlockId);
        } else {

          long startingGenStamp = namesystem.getBlockIdManager()
            .upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.dir.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }

        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // From here on, reads go through the (possibly decompressing)
        // wrapper; the digest still covers the raw bytes via fin.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);

        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;

      LOG.info("Image file " + curFile + " of size " + curFile.length()
          + " bytes loaded in " + (monotonicNow() - startTime) / 1000
          + " seconds.");
    }
449
450  /** Update the root node's attributes */
451  private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
452    final QuotaCounts q = root.getQuotaCounts();
453    final long nsQuota = q.getNameSpace();
454    final long dsQuota = q.getStorageSpace();
455    FSDirectory fsDir = namesystem.dir;
456    if (nsQuota != -1 || dsQuota != -1) {
457      fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
458    }
459    fsDir.rootDir.cloneModificationTime(root);
460    fsDir.rootDir.clonePermissionStatus(root);    
461  }
462  
    /**
     * Load fsimage files when 1) only local names are stored, 
     * and 2) snapshot is supported.
     * 
     * @param numFiles number of files expected to be read (unused here: the
     *        recursive walk is driven entirely by counts stored in the
     *        stream itself)
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
      
      // load root
      loadRoot(in, counter);
      // load rest of the nodes recursively
      loadDirectoryWithSnapshot(in, counter);
    }
483    
484  /** 
485   * load fsimage files assuming only local names are stored. Used when
486   * snapshots are not supported by the layout version.
487   *   
488   * @param numFiles number of files expected to be read
489   * @param in image input stream
490   * @param counter Counter to increment for namenode startup progress
491   * @throws IOException
492   */  
493   private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
494       throws IOException {
495     assert NameNodeLayoutVersion.supports(
496         LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
497     assert numFiles > 0;
498
499     // load root
500     loadRoot(in, counter);
501     // have loaded the first file (the root)
502     numFiles--; 
503
504     // load rest of the nodes directory by directory
505     while (numFiles > 0) {
506       numFiles -= loadDirectory(in, counter);
507     }
508     if (numFiles != 0) {
509       throw new IOException("Read unexpect number of files: " + -numFiles);
510     }
511   }
512   
    /**
     * Load information about root, and use the information to update the root
     * directory of NameSystem.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if the first inode in the stream is not the root
     *         (the root's serialized local name must have length 0)
     */
    private void loadRoot(DataInput in, Counter counter)
        throws IOException {
      // load root
      // the short read here is the length of the root's local name
      if (in.readShort() != 0) {
        throw new IOException("First node is not root");
      }
      final INodeDirectory root = loadINode(null, false, in, counter)
        .asDirectory();
      // update the root's attributes
      updateRootAttr(root);
    }
530   
531    /** Load children nodes for the parent directory. */
532    private int loadChildren(INodeDirectory parent, DataInput in,
533        Counter counter) throws IOException {
534      int numChildren = in.readInt();
535      for (int i = 0; i < numChildren; i++) {
536        // load single inode
537        INode newNode = loadINodeWithLocalName(false, in, true, counter);
538        addToParent(parent, newNode);
539      }
540      return numChildren;
541    }
542    
    /**
     * Load a directory when snapshot is supported.
     * Reads, in stream order: the directory's inode id, an optional snapshot
     * list, the children, the directory diff list, and then recurses into
     * each sub-directory (including snapshot copies of deleted directories).
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable
      // (a negative count means the directory is not snapshottable)
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(parent, numSnapshots, in, this);
        if (parent.getDirectorySnapshottableFeature().getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in 
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(parent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }
587    
588   /**
589    * Load all children of a directory
590    * 
591    * @param in input to load from
592    * @param counter Counter to increment for namenode startup progress
593    * @return number of child inodes read
594    * @throws IOException
595    */
596   private int loadDirectory(DataInput in, Counter counter) throws IOException {
597     String parentPath = FSImageSerialization.readString(in);
598     // Rename .snapshot paths if we're doing an upgrade
599     parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
600     final INodeDirectory parent = INodeDirectory.valueOf(
601         namesystem.dir.getINode(parentPath, true), parentPath);
602     return loadChildren(parent, in, counter);
603   }
604
  /**
   * load fsimage files assuming full path names are stored
   * (layouts without {@link LayoutVersion.Feature#FSIMAGE_NAME_OPTIMIZATION}).
   * Inodes appear in topological order, so a child's parent has always been
   * loaded before the child itself; the previous parent is cached to avoid
   * re-resolving the path for siblings.
   * 
   * @param numFiles total number of files to load
   * @param in data input stream
   * @param counter Counter to increment for namenode startup progress
   * @throws IOException if any error occurs
   */
  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
      throws IOException {
    byte[][] pathComponents;
    byte[][] parentPath = {{}};      
    FSDirectory fsDir = namesystem.dir;
    INodeDirectory parentINode = fsDir.rootDir;
    for (long i = 0; i < numFiles; i++) {
      pathComponents = FSImageSerialization.readPathComponents(in);
      // Rewrite reserved path components (e.g. ".snapshot") on upgrade.
      for (int j=0; j < pathComponents.length; j++) {
        byte[] newComponent = renameReservedComponentOnUpgrade
            (pathComponents[j], getLayoutVersion());
        if (!Arrays.equals(newComponent, pathComponents[j])) {
          String oldPath = DFSUtil.byteArray2PathString(pathComponents);
          pathComponents[j] = newComponent;
          String newPath = DFSUtil.byteArray2PathString(pathComponents);
          LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
        }
      }
      final INode newNode = loadINode(
          pathComponents[pathComponents.length-1], false, in, counter);

      if (isRoot(pathComponents)) { // it is the root
        // update the root's attributes
        updateRootAttr(newNode.asDirectory());
        continue;
      }

      namesystem.dir.addToInodeMap(newNode);
      // check if the new inode belongs to the same parent
      // (cached from the previous iteration)
      if(!isParent(pathComponents, parentPath)) {
        parentINode = getParentINodeDirectory(pathComponents);
        parentPath = getParent(pathComponents);
      }

      // add new inode
      addToParent(parentINode, newNode);
    }
  }
651
  /**
   * Resolve the parent directory of the given path.
   *
   * @param pathComponents full path split into components
   * @return the parent directory inode, or null if the path is the root
   * @throws FileNotFoundException if the parent path does not exist
   * @throws PathIsNotDirectoryException if the parent is not a directory
   * @throws UnresolvedLinkException if a symlink is encountered on the path
   */
  private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
      ) throws FileNotFoundException, PathIsNotDirectoryException,
      UnresolvedLinkException {
    if (pathComponents.length < 2) { // root
      return null;
    }
    // Gets the parent INode
    // (-2 selects the second-to-last inode in the resolved path, i.e. the
    // parent of the final component)
    final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
        pathComponents);
    return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
  }
663
664  /**
665   * Add the child node to parent and, if child is a file, update block map.
666   * This method is only used for image loading so that synchronization,
667   * modification time update and space count update are not needed.
668   */
669  private void addToParent(INodeDirectory parent, INode child)
670      throws IllegalReservedPathException {
671    FSDirectory fsDir = namesystem.dir;
672    if (parent == fsDir.rootDir) {
673        child.setLocalName(renameReservedRootComponentOnUpgrade(
674            child.getLocalNameBytes(), getLayoutVersion()));
675    }
676    // NOTE: This does not update space counts for parents
677    if (!parent.addChild(child)) {
678      return;
679    }
680    namesystem.dir.cacheName(child);
681
682    if (child.isFile()) {
683      updateBlocksMap(child.asFile());
684    }
685  }
686
687    public void updateBlocksMap(INodeFile file) {
688      // Add file->block mapping
689      final BlockInfoContiguous[] blocks = file.getBlocks();
690      if (blocks != null) {
691        final BlockManager bm = namesystem.getBlockManager();
692        for (int i = 0; i < blocks.length; i++) {
693          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
694        } 
695      }
696    }
697
    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    // NOTE(review): presumably exposed for snapshot-format helpers that need
    // the directory tree mid-load — confirm against callers.
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }
702
    /**
     * Convenience overload of
     * {@link #loadINodeWithLocalName(boolean, DataInput, boolean, Counter)}
     * that does not report startup progress (null counter).
     */
    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }
707
708    public INode loadINodeWithLocalName(boolean isSnapshotINode,
709        DataInput in, boolean updateINodeMap, Counter counter)
710        throws IOException {
711      byte[] localName = FSImageSerialization.readLocalName(in);
712      localName =
713          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
714      INode inode = loadINode(localName, isSnapshotINode, in, counter);
715      if (updateINodeMap) {
716        namesystem.dir.addToInodeMap(inode);
717      }
718      return inode;
719    }
720  
721  /**
722   * load an inode from fsimage except for its name
723   * 
724   * @param in data input stream from which image is read
725   * @param counter Counter to increment for namenode startup progress
726   * @return an inode
727   */
728  @SuppressWarnings("deprecation")
729  INode loadINode(final byte[] localName, boolean isSnapshotINode,
730      DataInput in, Counter counter) throws IOException {
731    final int imgVersion = getLayoutVersion();
732    if (NameNodeLayoutVersion.supports(
733        LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
734      namesystem.getFSDirectory().verifyINodeName(localName);
735    }
736
737    long inodeId = NameNodeLayoutVersion.supports(
738        LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
739        : namesystem.dir.allocateNewInodeId();
740    
741    final short replication = namesystem.getBlockManager().adjustReplication(
742        in.readShort());
743    final long modificationTime = in.readLong();
744    long atime = 0;
745    if (NameNodeLayoutVersion.supports(
746        LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
747      atime = in.readLong();
748    }
749    final long blockSize = in.readLong();
750    final int numBlocks = in.readInt();
751
752    if (numBlocks >= 0) {
753      // file
754      
755      // read blocks
756      BlockInfoContiguous[] blocks = new BlockInfoContiguous[numBlocks];
757      for (int j = 0; j < numBlocks; j++) {
758        blocks[j] = new BlockInfoContiguous(replication);
759        blocks[j].readFields(in);
760      }
761
762      String clientName = "";
763      String clientMachine = "";
764      boolean underConstruction = false;
765      FileDiffList fileDiffs = null;
766      if (NameNodeLayoutVersion.supports(
767          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
768        // read diffs
769        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
770
771        if (isSnapshotINode) {
772          underConstruction = in.readBoolean();
773          if (underConstruction) {
774            clientName = FSImageSerialization.readString(in);
775            clientMachine = FSImageSerialization.readString(in);
776            // convert the last block to BlockUC
777            if (blocks.length > 0) {
778              BlockInfoContiguous lastBlk = blocks[blocks.length - 1];
779              blocks[blocks.length - 1] = new BlockInfoContiguousUnderConstruction(
780                  lastBlk, replication);
781            }
782          }
783        }
784      }
785
786      final PermissionStatus permissions = PermissionStatus.read(in);
787
788      // return
789      if (counter != null) {
790        counter.increment();
791      }
792
793      final INodeFile file = new INodeFile(inodeId, localName, permissions,
794          modificationTime, atime, blocks, replication, blockSize, (byte)0);
795      if (underConstruction) {
796        file.toUnderConstruction(clientName, clientMachine);
797      }
798        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
799      } else if (numBlocks == -1) {
800        //directory
801      
802      //read quotas
803      final long nsQuota = in.readLong();
804      long dsQuota = -1L;
805      if (NameNodeLayoutVersion.supports(
806          LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
807        dsQuota = in.readLong();
808      }
809
810      //read snapshot info
811      boolean snapshottable = false;
812      boolean withSnapshot = false;
813      if (NameNodeLayoutVersion.supports(
814          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
815        snapshottable = in.readBoolean();
816        if (!snapshottable) {
817          withSnapshot = in.readBoolean();
818        }
819      }
820
821      final PermissionStatus permissions = PermissionStatus.read(in);
822
823      //return
824      if (counter != null) {
825        counter.increment();
826      }
827      final INodeDirectory dir = new INodeDirectory(inodeId, localName,
828          permissions, modificationTime);
829      if (nsQuota >= 0 || dsQuota >= 0) {
830        dir.addDirectoryWithQuotaFeature(new DirectoryWithQuotaFeature.Builder().
831            nameSpaceQuota(nsQuota).storageSpaceQuota(dsQuota).build());
832      }
833      if (withSnapshot) {
834        dir.addSnapshotFeature(null);
835      }
836      if (snapshottable) {
837        dir.addSnapshottableFeature();
838      }
839      return dir;
840    } else if (numBlocks == -2) {
841      //symlink
842      if (!FileSystem.areSymlinksEnabled()) {
843        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
844      }
845
846      final String symlink = Text.readString(in);
847      final PermissionStatus permissions = PermissionStatus.read(in);
848      if (counter != null) {
849        counter.increment();
850      }
851      return new INodeSymlink(inodeId, localName, permissions,
852          modificationTime, atime, symlink);
853    } else if (numBlocks == -3) {
854      //reference
855      // Intentionally do not increment counter, because it is too difficult at
856      // this point to assess whether or not this is a reference that counts
857      // toward quota.
858      
859      final boolean isWithName = in.readBoolean();
860      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
861      int snapshotId = in.readInt();
862      
863      final INodeReference.WithCount withCount
864          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
865
866      if (isWithName) {
867          return new INodeReference.WithName(null, withCount, localName,
868              snapshotId);
869      } else {
870        final INodeReference ref = new INodeReference.DstReference(null,
871            withCount, snapshotId);
872        return ref;
873      }
874    }
875    
876    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
877  }
878
879    /** Load {@link INodeFileAttributes}. */
880    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
881        throws IOException {
882      final int layoutVersion = getLayoutVersion();
883      
884      if (!NameNodeLayoutVersion.supports(
885          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
886        return loadINodeWithLocalName(true, in, false).asFile();
887      }
888  
889      final byte[] name = FSImageSerialization.readLocalName(in);
890      final PermissionStatus permissions = PermissionStatus.read(in);
891      final long modificationTime = in.readLong();
892      final long accessTime = in.readLong();
893  
894      final short replication = namesystem.getBlockManager().adjustReplication(
895          in.readShort());
896      final long preferredBlockSize = in.readLong();
897
898      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
899          accessTime, replication, preferredBlockSize, (byte) 0, null);
900    }
901
902    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
903        throws IOException {
904      final int layoutVersion = getLayoutVersion();
905      
906      if (!NameNodeLayoutVersion.supports(
907          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
908        return loadINodeWithLocalName(true, in, false).asDirectory();
909      }
910  
911      final byte[] name = FSImageSerialization.readLocalName(in);
912      final PermissionStatus permissions = PermissionStatus.read(in);
913      final long modificationTime = in.readLong();
914      
915      // Read quotas: quota by storage type does not need to be processed below.
916      // It is handled only in protobuf based FsImagePBINode class for newer
917      // fsImages. Tools using this class such as legacy-mode of offline image viewer
918      // should only load legacy FSImages without newer features.
919      final long nsQuota = in.readLong();
920      final long dsQuota = in.readLong();
921
922      return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy(
923          name, permissions, null, modificationTime, null)
924        : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
925            null, modificationTime, nsQuota, dsQuota, null, null);
926    }
927  
    /**
     * Load the "files under construction" section of the image and re-attach
     * the under-construction state (and, for non-snapshot files, the lease)
     * to the corresponding inodes already loaded into the namespace.
     *
     * @param in image stream positioned at the section
     * @param supportSnapshot whether the layout supports snapshots
     *        (not referenced in this body — kept for the call signature)
     * @param counter startup-progress counter, incremented once per file read
     * @throws IOException if the stream cannot be read or a referenced file
     *         does not exist in the namespace
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
            namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) && 
            NameNodeLayoutVersion.supports(
                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // Reserved components may need renaming when loading an old image.
          path = renameReservedPathsOnUpgrade(path, getLayoutVersion());
          final INodesInPath iip = fsDir.getINodesInPath(path, true);
          oldnode = INodeFile.valueOf(iip.getLastINode(), path);
        }

        // Re-open the existing inode rather than replacing it, so existing
        // references to it remain valid.
        FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
        oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine());
        if (oldnode.numBlocks() > 0) {
          BlockInfoContiguous ucBlock = cons.getLastBlock();
          // we do not replace the inode, just replace the last block of oldnode
          BlockInfoContiguous info = namesystem.getBlockManager().addBlockCollection(
              ucBlock, oldnode);
          oldnode.setBlock(oldnode.numBlocks() - 1, info);
        }

        // Files that exist only in snapshots carry no active lease.
        if (!inSnapshot) {
          namesystem.leaseManager.addLease(cons
              .getFileUnderConstructionFeature().getClientName(), path);
        }
      }
    }
974
975    private void loadSecretManagerState(DataInput in)
976        throws IOException {
977      int imgVersion = getLayoutVersion();
978
979      if (!NameNodeLayoutVersion.supports(
980          LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
981        //SecretManagerState is not available.
982        //This must not happen if security is turned on.
983        return; 
984      }
985      namesystem.loadSecretManagerStateCompat(in);
986    }
987
988    private void loadCacheManagerState(DataInput in) throws IOException {
989      int imgVersion = getLayoutVersion();
990      if (!NameNodeLayoutVersion.supports(
991          LayoutVersion.Feature.CACHING, imgVersion)) {
992        return;
993      }
994      namesystem.getCacheManager().loadStateCompat(in);
995    }
996
    /** @return the layout version of the image held by the storage directory. */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
1000
    // Returns true iff the component array denotes the filesystem root.
    // NOTE(review): this relies on the root being serialized as a single
    // null component (not an empty byte[]) — confirm against the writer side.
    private boolean isRoot(byte[][] path) {
      return path.length == 1 &&
        path[0] == null;    
    }
1005
1006    private boolean isParent(byte[][] path, byte[][] parent) {
1007      if (path == null || parent == null)
1008        return false;
1009      if (parent.length == 0 || path.length != parent.length + 1)
1010        return false;
1011      boolean isParent = true;
1012      for (int i = 0; i < parent.length; i++) {
1013        isParent = isParent && Arrays.equals(path[i], parent[i]); 
1014      }
1015      return isParent;
1016    }
1017
1018    /**
1019     * Return string representing the parent of the given path.
1020     */
1021    String getParent(String path) {
1022      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
1023    }
1024    
1025    byte[][] getParent(byte[][] path) {
1026      byte[][] result = new byte[path.length - 1][];
1027      for (int i = 0; i < result.length; i++) {
1028        result[i] = new byte[path[i].length];
1029        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
1030      }
1031      return result;
1032    }
1033    
    /** Read a snapshot id from the stream and resolve it via the snapshot map. */
    public Snapshot getSnapshot(DataInput in) throws IOException {
      return snapshotMap.get(in.readInt());
    }
1037  }
1038
  /**
   * Mapping from a reserved path component (e.g. ".snapshot") to the
   * replacement name used when renaming colliding user paths on upgrade.
   */
  @VisibleForTesting
  public static final TreeMap<String, String> renameReservedMap =
      new TreeMap<String, String>();
1042
1043  /**
1044   * Use the default key-value pairs that will be used to determine how to
1045   * rename reserved paths on upgrade.
1046   */
1047  @VisibleForTesting
1048  public static void useDefaultRenameReservedPairs() {
1049    renameReservedMap.clear();
1050    for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
1051      renameReservedMap.put(
1052          key,
1053          key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
1054              + "UPGRADE_RENAMED");
1055    }
1056  }
1057
1058  /**
1059   * Set the key-value pairs that will be used to determine how to rename
1060   * reserved paths on upgrade.
1061   */
1062  @VisibleForTesting
1063  public static void setRenameReservedPairs(String renameReserved) {
1064    // Clear and set the default values
1065    useDefaultRenameReservedPairs();
1066    // Overwrite with provided values
1067    setRenameReservedMapInternal(renameReserved);
1068  }
1069
1070  private static void setRenameReservedMapInternal(String renameReserved) {
1071    Collection<String> pairs =
1072        StringUtils.getTrimmedStringCollection(renameReserved);
1073    for (String p : pairs) {
1074      String[] pair = StringUtils.split(p, '/', '=');
1075      Preconditions.checkArgument(pair.length == 2,
1076          "Could not parse key-value pair " + p);
1077      String key = pair[0];
1078      String value = pair[1];
1079      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
1080          "Unknown reserved path " + key);
1081      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
1082          "Invalid rename path for " + key + ": " + value);
1083      LOG.info("Will rename reserved path " + key + " to " + value);
1084      renameReservedMap.put(key, value);
1085    }
1086  }
1087
1088  /**
1089   * When upgrading from an old version, the filesystem could contain paths
1090   * that are now reserved in the new version (e.g. .snapshot). This renames
1091   * these new reserved paths to a user-specified value to avoid collisions
1092   * with the reserved name.
1093   * 
1094   * @param path Old path potentially containing a reserved path
1095   * @return New path with reserved path components renamed to user value
1096   */
1097  static String renameReservedPathsOnUpgrade(String path,
1098      final int layoutVersion) throws IllegalReservedPathException {
1099    final String oldPath = path;
1100    // If any known LVs aren't supported, we're doing an upgrade
1101    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1102      String[] components = INode.getPathNames(path);
1103      // Only need to worry about the root directory
1104      if (components.length > 1) {
1105        components[1] = DFSUtil.bytes2String(
1106            renameReservedRootComponentOnUpgrade(
1107                DFSUtil.string2Bytes(components[1]),
1108                layoutVersion));
1109        path = DFSUtil.strings2PathString(components);
1110      }
1111    }
1112    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1113      String[] components = INode.getPathNames(path);
1114      // Special case the root path
1115      if (components.length == 0) {
1116        return path;
1117      }
1118      for (int i=0; i<components.length; i++) {
1119        components[i] = DFSUtil.bytes2String(
1120            renameReservedComponentOnUpgrade(
1121                DFSUtil.string2Bytes(components[i]),
1122                layoutVersion));
1123      }
1124      path = DFSUtil.strings2PathString(components);
1125    }
1126
1127    if (!path.equals(oldPath)) {
1128      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
1129          + path);
1130    }
1131    return path;
1132  }
1133
  /**
   * Error text used when a reserved path is encountered during upgrade but
   * no rename mapping has been configured for it.
   */
  private final static String RESERVED_ERROR_MSG = 
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";
1142
1143  /**
1144   * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1145   * byte array path component.
1146   */
1147  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
1148      final int layoutVersion) throws IllegalReservedPathException {
1149    // If the LV doesn't support snapshots, we're doing an upgrade
1150    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1151      if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
1152        if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) {
1153          throw new IllegalReservedPathException(RESERVED_ERROR_MSG);
1154        }
1155        component =
1156            DFSUtil.string2Bytes(renameReservedMap
1157                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
1158      }
1159    }
1160    return component;
1161  }
1162
1163  /**
1164   * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
1165   * byte array path component.
1166   */
1167  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
1168      final int layoutVersion) throws IllegalReservedPathException {
1169    // If the LV doesn't support inode IDs, we're doing an upgrade
1170    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1171      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
1172        if (!renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR)) {
1173          throw new IllegalReservedPathException(RESERVED_ERROR_MSG);
1174        }
1175        final String renameString = renameReservedMap
1176            .get(FSDirectory.DOT_RESERVED_STRING);
1177        component =
1178            DFSUtil.string2Bytes(renameString);
1179        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
1180            + " to " + renameString);
1181      }
1182    }
1183    return component;
1184  }
1185
1186  /**
1187   * A one-shot class responsible for writing an image file.
1188   * The write() function should be called once, after which the getter
1189   * functions may be used to retrieve information about the file that was written.
1190   *
1191   * This is replaced by the PB-based FSImage. The class is to maintain
1192   * compatibility for the external fsimage tool.
1193   */
1194  @Deprecated
1195  static class Saver {
1196    private static final int LAYOUT_VERSION = -51;
1197    public static final int CHECK_CANCEL_INTERVAL = 4096;
1198    private final SaveNamespaceContext context;
1199    /** Set to true once an image has been written */
1200    private boolean saved = false;
1201    private long checkCancelCounter = 0;
1202
1203    /** The MD5 checksum of the file that was written */
1204    private MD5Hash savedDigest;
1205    private final ReferenceMap referenceMap = new ReferenceMap();
1206
1207    private final Map<Long, INodeFile> snapshotUCMap =
1208        new HashMap<Long, INodeFile>();
1209
1210    /** @throws IllegalStateException if the instance has not yet saved an image */
1211    private void checkSaved() {
1212      if (!saved) {
1213        throw new IllegalStateException("FSImageSaver has not saved an image");
1214      }
1215    }
1216
1217    /** @throws IllegalStateException if the instance has already saved an image */
1218    private void checkNotSaved() {
1219      if (saved) {
1220        throw new IllegalStateException("FSImageSaver has already saved an image");
1221      }
1222    }
1223
1224
1225    Saver(SaveNamespaceContext context) {
1226      this.context = context;
1227    }
1228
1229    /**
1230     * Return the MD5 checksum of the image file that was saved.
1231     */
1232    MD5Hash getSavedDigest() {
1233      checkSaved();
1234      return savedDigest;
1235    }
1236
1237    void save(File newFile, FSImageCompression compression) throws IOException {
1238      checkNotSaved();
1239
1240      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
1241      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
1242      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
1243          .getSpaceConsumed().getNameSpace();
1244      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
1245      Step step = new Step(StepType.INODES, sdPath);
1246      StartupProgress prog = NameNode.getStartupProgress();
1247      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
1248      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
1249      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
1250      long startTime = monotonicNow();
1251      //
1252      // Write out data
1253      //
1254      MessageDigest digester = MD5Hash.getDigester();
1255      FileOutputStream fout = new FileOutputStream(newFile);
1256      DigestOutputStream fos = new DigestOutputStream(fout, digester);
1257      DataOutputStream out = new DataOutputStream(fos);
1258      try {
1259        out.writeInt(LAYOUT_VERSION);
1260        LayoutFlags.write(out);
1261        // We use the non-locked version of getNamespaceInfo here since
1262        // the coordinating thread of saveNamespace already has read-locked
1263        // the namespace for us. If we attempt to take another readlock
1264        // from the actual saver thread, there's a potential of a
1265        // fairness-related deadlock. See the comments on HDFS-2223.
1266        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1267            .getNamespaceID());
1268        out.writeLong(numINodes);
1269        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV1());
1270        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampV2());
1271        out.writeLong(sourceNamesystem.getBlockIdManager().getGenerationStampAtblockIdSwitch());
1272        out.writeLong(sourceNamesystem.getBlockIdManager().getLastAllocatedBlockId());
1273        out.writeLong(context.getTxId());
1274        out.writeLong(sourceNamesystem.dir.getLastInodeId());
1275
1276
1277        sourceNamesystem.getSnapshotManager().write(out);
1278
1279        // write compression info and set up compressed stream
1280        out = compression.writeHeaderAndWrapStream(fos);
1281        LOG.info("Saving image file " + newFile +
1282                 " using " + compression);
1283
1284        // save the root
1285        saveINode2Image(rootDir, out, false, referenceMap, counter);
1286        // save the rest of the nodes
1287        saveImage(rootDir, out, true, false, counter);
1288        prog.endStep(Phase.SAVING_CHECKPOINT, step);
1289        // Now that the step is finished, set counter equal to total to adjust
1290        // for possible under-counting due to reference inodes.
1291        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
1292        // save files under construction
1293        // TODO: for HDFS-5428, since we cannot break the compatibility of
1294        // fsimage, we store part of the under-construction files that are only
1295        // in snapshots in this "under-construction-file" section. As a
1296        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
1297        // paths, so that when loading fsimage we do not put them into the lease
1298        // map. In the future, we can remove this hack when we can bump the
1299        // layout version.
1300        sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
1301
1302        context.checkCancelled();
1303        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
1304        context.checkCancelled();
1305        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
1306        context.checkCancelled();
1307        out.flush();
1308        context.checkCancelled();
1309        fout.getChannel().force(true);
1310      } finally {
1311        out.close();
1312      }
1313
1314      saved = true;
1315      // set md5 of the saved image
1316      savedDigest = new MD5Hash(digester.digest());
1317
1318      LOG.info("Image file " + newFile + " of size " + newFile.length()
1319          + " bytes saved in " + (monotonicNow() - startTime) / 1000
1320          + " seconds.");
1321    }
1322
1323    /**
1324     * Save children INodes.
1325     * @param children The list of children INodes
1326     * @param out The DataOutputStream to write
1327     * @param inSnapshot Whether the parent directory or its ancestor is in
1328     *                   the deleted list of some snapshot (caused by rename or
1329     *                   deletion)
1330     * @param counter Counter to increment for namenode startup progress
1331     * @return Number of children that are directory
1332     */
1333    private int saveChildren(ReadOnlyList<INode> children,
1334        DataOutputStream out, boolean inSnapshot, Counter counter)
1335        throws IOException {
1336      // Write normal children INode.
1337      out.writeInt(children.size());
1338      int dirNum = 0;
1339      for(INode child : children) {
1340        // print all children first
1341        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1342        // here, thus even if the parent directory is in snapshot, we still
1343        // do not handle INodeUC as those stored in deleted list
1344        saveINode2Image(child, out, false, referenceMap, counter);
1345        if (child.isDirectory()) {
1346          dirNum++;
1347        } else if (inSnapshot && child.isFile()
1348            && child.asFile().isUnderConstruction()) {
1349          this.snapshotUCMap.put(child.getId(), child.asFile());
1350        }
1351        if (checkCancelCounter++ % CHECK_CANCEL_INTERVAL == 0) {
1352          context.checkCancelled();
1353        }
1354      }
1355      return dirNum;
1356    }
1357
1358    /**
1359     * Save file tree image starting from the given root.
1360     * This is a recursive procedure, which first saves all children and
1361     * snapshot diffs of a current directory and then moves inside the
1362     * sub-directories.
1363     *
1364     * @param current The current node
1365     * @param out The DataoutputStream to write the image
1366     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1367     *                      reference node, its subtree may already have been
1368     *                      saved before.
1369     * @param inSnapshot Whether the current directory is in snapshot
1370     * @param counter Counter to increment for namenode startup progress
1371     */
1372    private void saveImage(INodeDirectory current, DataOutputStream out,
1373        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1374        throws IOException {
1375      // write the inode id of the directory
1376      out.writeLong(current.getId());
1377
1378      if (!toSaveSubtree) {
1379        return;
1380      }
1381
1382      final ReadOnlyList<INode> children = current
1383          .getChildrenList(Snapshot.CURRENT_STATE_ID);
1384      int dirNum = 0;
1385      List<INodeDirectory> snapshotDirs = null;
1386      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
1387      if (sf != null) {
1388        snapshotDirs = new ArrayList<INodeDirectory>();
1389        sf.getSnapshotDirectory(snapshotDirs);
1390        dirNum += snapshotDirs.size();
1391      }
1392
1393      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1394      // Snapshots
1395      if (current.isDirectory() && current.asDirectory().isSnapshottable()) {
1396        SnapshotFSImageFormat.saveSnapshots(current.asDirectory(), out);
1397      } else {
1398        out.writeInt(-1); // # of snapshots
1399      }
1400
1401      // 3. Write children INode
1402      dirNum += saveChildren(children, out, inSnapshot, counter);
1403
1404      // 4. Write DirectoryDiff lists, if there is any.
1405      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1406
1407      // Write sub-tree of sub-directories, including possible snapshots of
1408      // deleted sub-directories
1409      out.writeInt(dirNum); // the number of sub-directories
1410      for(INode child : children) {
1411        if(!child.isDirectory()) {
1412          continue;
1413        }
1414        // make sure we only save the subtree under a reference node once
1415        boolean toSave = child.isReference() ?
1416            referenceMap.toProcessSubtree(child.getId()) : true;
1417        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1418      }
1419      if (snapshotDirs != null) {
1420        for (INodeDirectory subDir : snapshotDirs) {
1421          // make sure we only save the subtree under a reference node once
1422          boolean toSave = subDir.getParentReference() != null ?
1423              referenceMap.toProcessSubtree(subDir.getId()) : true;
1424          saveImage(subDir, out, toSave, true, counter);
1425        }
1426      }
1427    }
1428
1429    /**
1430     * Saves inode and increments progress counter.
1431     *
1432     * @param inode INode to save
1433     * @param out DataOutputStream to receive inode
1434     * @param writeUnderConstruction boolean true if this is under construction
1435     * @param referenceMap ReferenceMap containing reference inodes
1436     * @param counter Counter to increment for namenode startup progress
1437     * @throws IOException thrown if there is an I/O error
1438     */
1439    private void saveINode2Image(INode inode, DataOutputStream out,
1440        boolean writeUnderConstruction, ReferenceMap referenceMap,
1441        Counter counter) throws IOException {
1442      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1443        referenceMap);
1444      // Intentionally do not increment counter for reference inodes, because it
1445      // is too difficult at this point to assess whether or not this is a
1446      // reference that counts toward quota.
1447      if (!(inode instanceof INodeReference)) {
1448        counter.increment();
1449      }
1450    }
1451  }
1452}