001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
093import static org.apache.hadoop.util.Time.now;
094import static org.apache.hadoop.util.Time.monotonicNow;
095
096import java.io.BufferedWriter;
097import java.io.ByteArrayInputStream;
098import java.io.DataInput;
099import java.io.DataInputStream;
100import java.io.DataOutputStream;
101import java.io.File;
102import java.io.FileNotFoundException;
103import java.io.FileOutputStream;
104import java.io.IOException;
105import java.io.OutputStreamWriter;
106import java.io.PrintWriter;
107import java.io.StringWriter;
108import java.lang.management.ManagementFactory;
109import java.net.InetAddress;
110import java.net.URI;
111import java.security.GeneralSecurityException;
112import java.util.ArrayList;
113import java.util.Arrays;
114import java.util.Collection;
115import java.util.Collections;
116import java.util.Date;
117import java.util.EnumSet;
118import java.util.HashMap;
119import java.util.HashSet;
120import java.util.Iterator;
121import java.util.LinkedHashSet;
122import java.util.List;
123import java.util.Map;
124import java.util.Set;
125import java.util.TreeMap;
126import java.util.concurrent.TimeUnit;
127import java.util.concurrent.locks.Condition;
128import java.util.concurrent.locks.ReentrantLock;
129import java.util.concurrent.locks.ReentrantReadWriteLock;
130
131import javax.management.NotCompliantMBeanException;
132import javax.management.ObjectName;
133import javax.management.StandardMBean;
134
135import org.apache.commons.logging.Log;
136import org.apache.commons.logging.LogFactory;
137import org.apache.commons.logging.impl.Log4JLogger;
138import org.apache.hadoop.HadoopIllegalArgumentException;
139import org.apache.hadoop.classification.InterfaceAudience;
140import org.apache.hadoop.conf.Configuration;
141import org.apache.hadoop.crypto.CipherSuite;
142import org.apache.hadoop.crypto.CryptoProtocolVersion;
143import org.apache.hadoop.crypto.key.KeyProvider;
144import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
145import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
146import org.apache.hadoop.fs.CacheFlag;
147import org.apache.hadoop.fs.ContentSummary;
148import org.apache.hadoop.fs.CreateFlag;
149import org.apache.hadoop.fs.FileAlreadyExistsException;
150import org.apache.hadoop.fs.FileEncryptionInfo;
151import org.apache.hadoop.fs.FileStatus;
152import org.apache.hadoop.fs.FileSystem;
153import org.apache.hadoop.fs.FsServerDefaults;
154import org.apache.hadoop.fs.InvalidPathException;
155import org.apache.hadoop.fs.Options;
156import org.apache.hadoop.fs.ParentNotDirectoryException;
157import org.apache.hadoop.fs.Path;
158import org.apache.hadoop.fs.UnresolvedLinkException;
159import org.apache.hadoop.fs.XAttr;
160import org.apache.hadoop.fs.XAttrSetFlag;
161import org.apache.hadoop.fs.permission.AclEntry;
162import org.apache.hadoop.fs.permission.AclStatus;
163import org.apache.hadoop.fs.permission.FsAction;
164import org.apache.hadoop.fs.permission.FsPermission;
165import org.apache.hadoop.fs.permission.PermissionStatus;
166import org.apache.hadoop.fs.StorageType;
167import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
168import org.apache.hadoop.ha.ServiceFailedException;
169import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
170import org.apache.hadoop.hdfs.DFSConfigKeys;
171import org.apache.hadoop.hdfs.DFSUtil;
172import org.apache.hadoop.hdfs.HAUtil;
173import org.apache.hadoop.hdfs.HdfsConfiguration;
174import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
175import org.apache.hadoop.hdfs.XAttrHelper;
176import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
177import org.apache.hadoop.hdfs.protocol.Block;
178import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
179import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
180import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
181import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
182import org.apache.hadoop.hdfs.protocol.ClientProtocol;
183import org.apache.hadoop.hdfs.protocol.DatanodeID;
184import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
185import org.apache.hadoop.hdfs.protocol.DirectoryListing;
186import org.apache.hadoop.hdfs.protocol.EncryptionZone;
187import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
188import org.apache.hadoop.hdfs.protocol.HdfsConstants;
189import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus;
190import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
191import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
192import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
193import org.apache.hadoop.hdfs.protocol.LocatedBlock;
194import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
195import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
196import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
197import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
198import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
199import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
200import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
201import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
202import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
203import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
205import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
208import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
209import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
210import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
213import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
217import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
221import org.apache.hadoop.hdfs.server.common.Storage;
222import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
224import org.apache.hadoop.hdfs.server.common.Util;
225import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
226import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
227import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
228import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
229import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
230import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
231import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
232import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
233import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
234import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
235import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
236import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
237import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
238import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
239import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
241import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
243import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
244import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
245import org.apache.hadoop.hdfs.server.namenode.top.TopConf;
246import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
247import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager;
248import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
249import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
250import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
252import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
253import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
254import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
255import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
256import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
257import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
258import org.apache.hadoop.hdfs.server.protocol.StorageReport;
259import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
260import org.apache.hadoop.io.EnumSetWritable;
261import org.apache.hadoop.io.IOUtils;
262import org.apache.hadoop.io.Text;
263import org.apache.hadoop.ipc.RetriableException;
264import org.apache.hadoop.ipc.RetryCache;
265import org.apache.hadoop.ipc.Server;
266import org.apache.hadoop.ipc.StandbyException;
267import org.apache.hadoop.metrics2.annotation.Metric;
268import org.apache.hadoop.metrics2.annotation.Metrics;
269import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
270import org.apache.hadoop.metrics2.lib.MetricsRegistry;
271import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
272import org.apache.hadoop.metrics2.util.MBeans;
273import org.apache.hadoop.net.NetworkTopology;
274import org.apache.hadoop.net.Node;
275import org.apache.hadoop.net.NodeBase;
276import org.apache.hadoop.security.AccessControlException;
277import org.apache.hadoop.security.UserGroupInformation;
278import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
279import org.apache.hadoop.security.token.SecretManager.InvalidToken;
280import org.apache.hadoop.security.token.Token;
281import org.apache.hadoop.security.token.TokenIdentifier;
282import org.apache.hadoop.security.token.delegation.DelegationKey;
283import org.apache.hadoop.util.ChunkedArrayList;
284import org.apache.hadoop.util.Daemon;
285import org.apache.hadoop.util.DataChecksum;
286import org.apache.hadoop.util.ReflectionUtils;
287import org.apache.hadoop.util.StringUtils;
288import org.apache.hadoop.util.VersionInfo;
289import org.apache.log4j.Appender;
290import org.apache.log4j.AsyncAppender;
291import org.apache.log4j.Logger;
292import org.codehaus.jackson.map.ObjectMapper;
293import org.mortbay.util.ajax.JSON;
294
295import com.google.common.annotations.VisibleForTesting;
296import com.google.common.base.Charsets;
297import com.google.common.base.Preconditions;
298import com.google.common.collect.ImmutableMap;
299import com.google.common.collect.Lists;
300
301/***************************************************
 * FSNamesystem does the actual bookkeeping work for the
 * NameNode.
304 *
305 * It tracks several important tables.
306 *
307 * 1)  valid fsname --> blocklist  (kept on disk, logged)
308 * 2)  Set of all valid blocks (inverted #1)
309 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
310 * 4)  machine --> blocklist (inverted #2)
311 * 5)  LRU cache of updated-heartbeat machines
312 ***************************************************/
313@InterfaceAudience.Private
314@Metrics(context="dfs")
315public class FSNamesystem implements Namesystem, FSNamesystemMBean,
316  NameNodeMXBean {
  /** Primary logger for FSNamesystem events. */
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
  /** Metrics registry backing the @Metric-annotated members of this class. */
  private final MetricsRegistry registry = new MetricsRegistry("FSNamesystem");
  @Metric final MutableRatesWithAggregation detailedLockHoldTimeMetrics =
      registry.newRatesWithAggregation("detailedLockHoldTimeMetrics");

  // Per-thread buffer reused when formatting audit log lines, to avoid
  // allocating a new StringBuilder on every audited operation.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };

  // Allocates and tracks block/generation-stamp IDs for this namespace.
  private final BlockIdManager blockIdManager;
331
332  @VisibleForTesting
333  public boolean isAuditEnabled() {
334    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
335  }
336
  /**
   * Convenience overload: log an audit event with no destination path and
   * no file status. Delegates to the five-argument form.
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
341  
342  private void logAuditEvent(boolean succeeded, String cmd, String src,
343      String dst, HdfsFileStatus stat) throws IOException {
344    if (isAuditEnabled() && isExternalInvocation()) {
345      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
346                    cmd, src, dst, stat);
347    }
348  }
349
350  private void logAuditEvent(boolean succeeded,
351      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
352      String dst, HdfsFileStatus stat) {
353    FileStatus status = null;
354    if (stat != null) {
355      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
356      Path path = dst != null ? new Path(dst) : new Path(src);
357      status = new FileStatus(stat.getLen(), stat.isDir(),
358          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
359          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
360          stat.getGroup(), symlink, path);
361    }
362    for (AuditLogger logger : auditLoggers) {
363      if (logger instanceof HdfsAuditLogger) {
364        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
365        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
366            status, ugi, dtSecretManager);
367      } else {
368        logger.logAuditEvent(succeeded, ugi.toString(), addr,
369            cmd, src, dst, status);
370      }
371    }
372  }
373
374  /**
375   * Logger for audit events, noting successful FSNamesystem operations. Emits
376   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
377   * <code>key=value</code> pairs to be written for the following properties:
378   * <code>
379   * ugi=&lt;ugi in RPC&gt;
380   * ip=&lt;remote IP&gt;
381   * cmd=&lt;command&gt;
382   * src=&lt;src path&gt;
383   * dst=&lt;dst path (optional)&gt;
384   * perm=&lt;permissions (optional)&gt;
385   * </code>
386   */
387  public static final Log auditLog = LogFactory.getLog(
388      FSNamesystem.class.getName() + ".audit");
389
390  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
391  static int BLOCK_DELETION_INCREMENT = 1000;
392  private final boolean isPermissionEnabled;
393  private final UserGroupInformation fsOwner;
394  private final String supergroup;
395  private final boolean standbyShouldCheckpoint;
396  
397  // Scan interval is not configurable.
398  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
399    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
400  final DelegationTokenSecretManager dtSecretManager;
401  private final boolean alwaysUseDelegationTokensForTests;
402
403  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
404    new Step(StepType.AWAITING_REPORTED_BLOCKS);
405
406  // Tracks whether the default audit logger is the only configured audit
407  // logger; this allows isAuditEnabled() to return false in case the
408  // underlying logger is disabled, and avoid some unnecessary work.
409  private final boolean isDefaultAuditLogger;
410  private final List<AuditLogger> auditLoggers;
411
  /** The namespace tree. */
  FSDirectory dir;
  // Manages block -> datanode mappings and replication work.
  private final BlockManager blockManager;
  // Manages snapshottable directories and snapshots.
  private final SnapshotManager snapshotManager;
  // Manages cache directives and cache pools.
  private final CacheManager cacheManager;
  // Aggregated statistics over all live datanodes.
  private final DatanodeStatistics datanodeStatistics;

  // Nameservice ID of this NN (federation/HA); may be null.
  private String nameserviceId;

  // Non-null only while a rolling upgrade is in progress.
  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
  /**
   * A flag that indicates whether the checkpointer should checkpoint a rollback
   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
   * rollback fsimage if the flag is true, and then change the flag to false.
   */
  private volatile boolean needRollbackFsImage;

  // Block pool ID used by this namenode
  private String blockPoolId;

  // Tracks file leases for files under construction.
  final LeaseManager leaseManager = new LeaseManager(this);

  volatile Daemon smmthread = null;  // SafeModeMonitor thread
  
  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread

  // A daemon to periodically clean up corrupt lazyPersist files
  // from the name space.
  Daemon lazyPersistFileScrubber = null;
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread 
   */
  private final int editLogRollerInterval;

  /**
   * How frequently we scan and unlink corrupt lazyPersist files.
   * (In seconds)
   */
  private final int lazyPersistFileScrubIntervalSec;

  // Set by the resource monitor: true when name dirs have enough space.
  private volatile boolean hasResourcesAvailable = false;
  // Cleared on close() to stop background daemons.
  private volatile boolean fsRunning = true;
  
  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  // Defaults (block size, replication, checksums, ...) reported to clients.
  private final FsServerDefaults serverDefaults;
  // Whether append is enabled (dfs.support.append).
  private final boolean supportAppends;
  // Policy for replacing a datanode on write-pipeline failure.
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;          // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. */
  private final FSNamesystemLock fsLock;

  /** 
   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
   * does not provide proper protection, because there are operations that
   * modify both block and name system state.  Even on standby, fsLock is 
   * used when block state changes need to be blocked.
   */
  private final ReentrantLock cpLock;

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called. 
   */
  private HAContext haContext;

  private final boolean haEnabled;

  /** flag indicating whether replication queues have been initialized */
  boolean initializedReplQueues = false;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  // Caches responses of non-idempotent RPCs for retried client calls.
  private final RetryCache retryCache;

  // Key provider for encryption zones; null when encryption is unconfigured.
  private KeyProviderCryptoExtension provider = null;

  // True once the fsimage has been loaded; guarded by fsLock + cond.
  private volatile boolean imageLoaded = false;
  // Signalled (via fsLock's write lock) when imageLoaded flips to true.
  private final Condition cond;

  // The fsimage/edit-log storage this namesystem was loaded from.
  private final FSImage fsImage;

  // Configuration for the "top users" metrics feature.
  private final TopConf topConf;
  private TopMetrics topMetrics;

  // Optional pluggable provider of INode attributes (authorization hooks).
  private INodeAttributeProvider inodeAttributeProvider;
535
536  /**
537   * Notify that loading of this FSDirectory is complete, and
538   * it is imageLoaded for use
539   */
540  void imageLoadComplete() {
541    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
542    setImageLoaded();
543  }
544
545  void setImageLoaded() {
546    if(imageLoaded) return;
547    writeLock();
548    try {
549      setImageLoaded(true);
550      dir.markNameCacheInitialized();
551      cond.signalAll();
552    } finally {
553      writeUnlock("setImageLoaded");
554    }
555  }
556
  //This is for testing purposes only
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }
562
  // exposed for unit tests
  // Unconditionally sets the image-loaded flag; does NOT signal waiters —
  // use setImageLoaded() for that.
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
567
568  /**
569   * Block until the object is imageLoaded to be used.
570   */
571  void waitForLoadingFSImage() {
572    if (!imageLoaded) {
573      writeLock();
574      try {
575        while (!imageLoaded) {
576          try {
577            cond.await(5000, TimeUnit.MILLISECONDS);
578          } catch (InterruptedException ignored) {
579          }
580        }
581      } finally {
582        writeUnlock();
583      }
584    }
585  }
586
587  /**
588   * Clear all loaded data
589   */
590  void clear() {
591    dir.reset();
592    dtSecretManager.reset();
593    blockIdManager.clear();
594    leaseManager.removeAllLeases();
595    snapshotManager.clearSnapshottableDirs();
596    cacheManager.clear();
597    setImageLoaded(false);
598    blockManager.clear();
599  }
600
  /** @return the lease manager tracking files under construction. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
605  
  /** @return true if this namesystem is configured for HA operation. */
  boolean isHaEnabled() {
    return haEnabled;
  }
609  
610  /**
611   * Check the supplied configuration for correctness.
612   * @param conf Supplies the configuration to validate.
613   * @throws IOException if the configuration could not be queried.
614   * @throws IllegalArgumentException if the configuration is invalid.
615   */
616  private static void checkConfiguration(Configuration conf)
617      throws IOException {
618
619    final Collection<URI> namespaceDirs =
620        FSNamesystem.getNamespaceDirs(conf);
621    final Collection<URI> editsDirs =
622        FSNamesystem.getNamespaceEditsDirs(conf);
623    final Collection<URI> requiredEditsDirs =
624        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
625    final Collection<URI> sharedEditsDirs =
626        FSNamesystem.getSharedEditsDirs(conf);
627
628    for (URI u : requiredEditsDirs) {
629      if (u.toString().compareTo(
630              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
631        continue;
632      }
633
634      // Each required directory must also be in editsDirs or in
635      // sharedEditsDirs.
636      if (!editsDirs.contains(u) &&
637          !sharedEditsDirs.contains(u)) {
638        throw new IllegalArgumentException(
639            "Required edits directory " + u.toString() + " not present in " +
640            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
641            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
642            editsDirs.toString() + "; " +
643            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
644            requiredEditsDirs.toString() + ". " +
645            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
646            sharedEditsDirs.toString() + ".");
647      }
648    }
649
650    if (namespaceDirs.size() == 1) {
651      LOG.warn("Only one image storage directory ("
652          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
653          + " due to lack of redundant storage directories!");
654    }
655    if (editsDirs.size() == 1) {
656      LOG.warn("Only one namespace edits storage directory ("
657          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
658          + " due to lack of redundant storage directories!");
659    }
660  }
661
662  /**
663   * Instantiates an FSNamesystem loaded from the image and edits
664   * directories specified in the passed Configuration.
665   *
666   * @param conf the Configuration which specifies the storage directories
667   *             from which to load
668   * @return an FSNamesystem which contains the loaded namespace
669   * @throws IOException if loading fails
670   */
671  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
672
673    checkConfiguration(conf);
674    FSImage fsImage = new FSImage(conf,
675        FSNamesystem.getNamespaceDirs(conf),
676        FSNamesystem.getNamespaceEditsDirs(conf));
677    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
678    StartupOption startOpt = NameNode.getStartupOption(conf);
679    if (startOpt == StartupOption.RECOVER) {
680      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
681    }
682
683    long loadStart = monotonicNow();
684    try {
685      namesystem.loadFSImage(startOpt);
686    } catch (IOException ioe) {
687      LOG.warn("Encountered exception loading fsimage", ioe);
688      fsImage.close();
689      throw ioe;
690    }
691    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
692    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
693    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
694    if (nnMetrics != null) {
695      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
696    }
697    return namesystem;
698  }
699  
  /**
   * Convenience constructor; equivalent to
   * {@code FSNamesystem(conf, fsImage, false)}, i.e. with the retry cache
   * setup step enabled.
   *
   * @param conf configuration
   * @param fsImage the FSImage to associate with
   * @throws IOException on bad configuration
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
703  
704  /**
705   * Create an FSNamesystem associated with the specified image.
706   * 
707   * Note that this does not load any data off of disk -- if you would
708   * like that behavior, use {@link #loadFromDisk(Configuration)}
709   *
710   * @param conf configuration
711   * @param fsImage The FSImage to associate with
712   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
713   *                         step. For Secondary NN this should be set to true.
714   * @throws IOException on bad configuration
715   */
716  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
717      throws IOException {
718    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
719    if (provider == null) {
720      LOG.info("No KeyProvider found.");
721    } else {
722      LOG.info("Found KeyProvider: " + provider.toString());
723    }
724    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
725                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
726      LOG.info("Enabling async auditlog");
727      enableAsyncAuditLog();
728    }
729    fsLock = new FSNamesystemLock(conf, detailedLockHoldTimeMetrics);
730    cond = fsLock.newWriteLockCondition();
731    cpLock = new ReentrantLock();
732
733    this.fsImage = fsImage;
734    try {
735      resourceRecheckInterval = conf.getLong(
736          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
737          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
738
739      this.blockManager = new BlockManager(this, conf);
740      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
741      this.blockIdManager = new BlockIdManager(blockManager);
742
743      this.fsOwner = UserGroupInformation.getCurrentUser();
744      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
745                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
746      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
747                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
748      LOG.info("fsOwner             = " + fsOwner);
749      LOG.info("supergroup          = " + supergroup);
750      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
751
752      // block allocation has to be persisted in HA using a shared edits directory
753      // so that the standby has up-to-date namespace information
754      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
755      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
756      
757      // Sanity check the HA-related config.
758      if (nameserviceId != null) {
759        LOG.info("Determined nameservice ID: " + nameserviceId);
760      }
761      LOG.info("HA Enabled: " + haEnabled);
762      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
763        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
764        throw new IOException("Invalid configuration: a shared edits dir " +
765            "must not be specified if HA is not enabled.");
766      }
767
768      // Get the checksum type from config
769      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
770      DataChecksum.Type checksumType;
771      try {
772         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
773      } catch (IllegalArgumentException iae) {
774         throw new IOException("Invalid checksum type in "
775            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
776      }
777
778      this.serverDefaults = new FsServerDefaults(
779          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
780          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
781          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
782          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
783          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
784          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
785          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
786          checksumType);
787      
788      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
789                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
790
791      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
792          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
793      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
794          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
795      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
796          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
797      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
798      LOG.info("Append Enabled: " + supportAppends);
799
800      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
801      
802      this.standbyShouldCheckpoint = conf.getBoolean(
803          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
804      // # edit autoroll threshold is a multiple of the checkpoint threshold 
805      this.editLogRollerThreshold = (long)
806          (conf.getFloat(
807              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
808              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
809          conf.getLong(
810              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
811              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
812      this.editLogRollerInterval = conf.getInt(
813          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
814          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
815
816      this.lazyPersistFileScrubIntervalSec = conf.getInt(
817          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
818          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
819
820      if (this.lazyPersistFileScrubIntervalSec == 0) {
821        throw new IllegalArgumentException(
822            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
823      }
824
825      // For testing purposes, allow the DT secret manager to be started regardless
826      // of whether security is enabled.
827      alwaysUseDelegationTokensForTests = conf.getBoolean(
828          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
829          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
830      
831      this.dtSecretManager = createDelegationTokenSecretManager(conf);
832      this.dir = new FSDirectory(this, conf);
833      this.snapshotManager = new SnapshotManager(dir);
834      this.cacheManager = new CacheManager(this, conf, blockManager);
835      this.safeMode = new SafeModeInfo(conf);
836      this.topConf = new TopConf(conf);
837      this.auditLoggers = initAuditLoggers(conf);
838      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
839        auditLoggers.get(0) instanceof DefaultAuditLogger;
840      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
841      Class<? extends INodeAttributeProvider> klass = conf.getClass(
842          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
843          null, INodeAttributeProvider.class);
844      if (klass != null) {
845        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
846        LOG.info("Using INode attribute provider: " + klass.getName());
847      }
848    } catch(IOException e) {
849      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
850      close();
851      throw e;
852    } catch (RuntimeException re) {
853      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
854      close();
855      throw re;
856    }
857  }
858
859  @VisibleForTesting
860  public List<AuditLogger> getAuditLoggers() {
861    return auditLoggers;
862  }
863
864  @VisibleForTesting
865  public RetryCache getRetryCache() {
866    return retryCache;
867  }
868
869  void lockRetryCache() {
870    if (retryCache != null) {
871      retryCache.lock();
872    }
873  }
874
875  void unlockRetryCache() {
876    if (retryCache != null) {
877      retryCache.unlock();
878    }
879  }
880
881  /** Whether or not retry cache is enabled */
882  boolean hasRetryCache() {
883    return retryCache != null;
884  }
885  
886  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
887    if (retryCache != null) {
888      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
889    }
890  }
891  
892  void addCacheEntry(byte[] clientId, int callId) {
893    if (retryCache != null) {
894      retryCache.addCacheEntry(clientId, callId);
895    }
896  }
897
898  @VisibleForTesting
899  public KeyProviderCryptoExtension getProvider() {
900    return provider;
901  }
902
903  @VisibleForTesting
904  static RetryCache initRetryCache(Configuration conf) {
905    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
906                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
907    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
908    if (enable) {
909      float heapPercent = conf.getFloat(
910          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
911          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
912      long entryExpiryMillis = conf.getLong(
913          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
914          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
915      LOG.info("Retry cache will use " + heapPercent
916          + " of total heap and retry cache entry expiry time is "
917          + entryExpiryMillis + " millis");
918      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
919      return new RetryCache("NameNodeRetryCache", heapPercent,
920          entryExpiryNanos);
921    }
922    return null;
923  }
924
925  private List<AuditLogger> initAuditLoggers(Configuration conf) {
926    // Initialize the custom access loggers if configured.
927    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
928    List<AuditLogger> auditLoggers = Lists.newArrayList();
929    if (alClasses != null && !alClasses.isEmpty()) {
930      for (String className : alClasses) {
931        try {
932          AuditLogger logger;
933          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
934            logger = new DefaultAuditLogger();
935          } else {
936            logger = (AuditLogger) Class.forName(className).newInstance();
937          }
938          logger.initialize(conf);
939          auditLoggers.add(logger);
940        } catch (RuntimeException re) {
941          throw re;
942        } catch (Exception e) {
943          throw new RuntimeException(e);
944        }
945      }
946    }
947
948    // Make sure there is at least one logger installed.
949    if (auditLoggers.isEmpty()) {
950      auditLoggers.add(new DefaultAuditLogger());
951    }
952
953    // Add audit logger to calculate top users
954    if (topConf.isEnabled) {
955      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
956      auditLoggers.add(new TopAuditLogger(topMetrics));
957    }
958
959    return Collections.unmodifiableList(auditLoggers);
960  }
961
  /**
   * Load the namespace from the associated FSImage and edit logs, formatting
   * first when the FORMAT startup option was given. Depending on the startup
   * option and HA mode this may also save a fresh image and/or open the edit
   * log for write. Runs under the namesystem write lock.
   *
   * @param startOpt how the NameNode was started (FORMAT, REGULAR, ...).
   * @throws IOException if the image or edit logs cannot be loaded.
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      // After formatting, continue with a normal startup.
      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        // Clear any in-progress rolling upgrade state on rollback/downgrade.
        rollingUpgradeInfo = null;
      }
      // Re-save only if the image is stale, we are not a standby, and no
      // rolling upgrade is underway.
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Loading failed: release image/edit log resources before rethrow.
        fsImage.close();
      }
      writeUnlock("loadFSImage");
    }
    imageLoadComplete();
  }
1012
1013  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1014      StartupOption startOpt) throws IOException {
1015    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1016        .matches(startOpt) && layoutVersion > HdfsConstants
1017        .NAMENODE_LAYOUT_VERSION;
1018    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1019        .matches(startOpt);
1020    if (rollingRollback || rollingStarted) {
1021      fsImage.updateStorageVersion();
1022    }
1023  }
1024
1025  private void startSecretManager() {
1026    if (dtSecretManager != null) {
1027      try {
1028        dtSecretManager.startThreads();
1029      } catch (IOException e) {
1030        // Inability to start secret manager
1031        // can't be recovered from.
1032        throw new RuntimeException(e);
1033      }
1034    }
1035  }
1036  
1037  private void startSecretManagerIfNecessary() {
1038    boolean shouldRun = shouldUseDelegationTokens() &&
1039      !isInSafeMode() && getEditLog().isOpenForWrite();
1040    boolean running = dtSecretManager.isRunning();
1041    if (shouldRun && !running) {
1042      startSecretManager();
1043    }
1044  }
1045
1046  private void stopSecretManager() {
1047    if (dtSecretManager != null) {
1048      dtSecretManager.stopThreads();
1049    }
1050  }
1051  
1052  /** 
1053   * Start services common to both active and standby states
1054   */
1055  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1056    this.registerMBean(); // register the MBean for the FSNamesystemState
1057    writeLock();
1058    this.haContext = haContext;
1059    try {
1060      nnResourceChecker = new NameNodeResourceChecker(conf);
1061      checkAvailableResources();
1062      assert safeMode != null && !isPopulatingReplQueues();
1063      StartupProgress prog = NameNode.getStartupProgress();
1064      prog.beginPhase(Phase.SAFEMODE);
1065      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1066        getCompleteBlocksTotal());
1067      setBlockTotal();
1068      blockManager.activate(conf);
1069    } finally {
1070      writeUnlock("startCommonServices");
1071    }
1072    
1073    registerMXBean();
1074    DefaultMetricsSystem.instance().register(this);
1075    if (inodeAttributeProvider != null) {
1076      inodeAttributeProvider.start();
1077      dir.setINodeAttributeProvider(inodeAttributeProvider);
1078    }
1079    snapshotManager.registerMXBean();
1080  }
1081  
1082  /** 
1083   * Stop services common to both active and standby states
1084   */
1085  void stopCommonServices() {
1086    writeLock();
1087    if (inodeAttributeProvider != null) {
1088      dir.setINodeAttributeProvider(null);
1089      inodeAttributeProvider.stop();
1090    }
1091    try {
1092      if (blockManager != null) blockManager.close();
1093    } finally {
1094      writeUnlock("stopCommonServices");
1095    }
1096    RetryCache.clear(retryCache);
1097  }
1098  
1099  /**
1100   * Start services required in active state
1101   * @throws IOException
1102   */
1103  void startActiveServices() throws IOException {
1104    startingActiveService = true;
1105    LOG.info("Starting services required for active state");
1106    writeLock();
1107    try {
1108      FSEditLog editLog = getFSImage().getEditLog();
1109      
1110      if (!editLog.isOpenForWrite()) {
1111        // During startup, we're already open for write during initialization.
1112        editLog.initJournalsForWrite();
1113        // May need to recover
1114        editLog.recoverUnclosedStreams();
1115        
1116        LOG.info("Catching up to latest edits from old active before " +
1117            "taking over writer role in edits logs");
1118        editLogTailer.catchupDuringFailover();
1119        
1120        blockManager.setPostponeBlocksFromFuture(false);
1121        blockManager.getDatanodeManager().markAllDatanodesStale();
1122        blockManager.clearQueues();
1123        blockManager.processAllPendingDNMessages();
1124
1125        // Only need to re-process the queue, If not in SafeMode.
1126        if (!isInSafeMode()) {
1127          LOG.info("Reprocessing replication and invalidation queues");
1128          initializeReplQueues();
1129        }
1130
1131        if (LOG.isDebugEnabled()) {
1132          LOG.debug("NameNode metadata after re-processing " +
1133              "replication and invalidation queues during failover:\n" +
1134              metaSaveAsString());
1135        }
1136        
1137        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1138        LOG.info("Will take over writing edit logs at txnid " + 
1139            nextTxId);
1140        editLog.setNextTxId(nextTxId);
1141
1142        getFSImage().editLog.openForWrite();
1143      }
1144
1145      // Enable quota checks.
1146      dir.enableQuotaChecks();
1147      if (haEnabled) {
1148        // Renew all of the leases before becoming active.
1149        // This is because, while we were in standby mode,
1150        // the leases weren't getting renewed on this NN.
1151        // Give them all a fresh start here.
1152        leaseManager.renewAllLeases();
1153      }
1154      leaseManager.startMonitor();
1155      startSecretManagerIfNecessary();
1156
1157      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1158      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1159      nnrmthread.start();
1160
1161      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1162          editLogRollerThreshold, editLogRollerInterval));
1163      nnEditLogRoller.start();
1164
1165      if (lazyPersistFileScrubIntervalSec > 0) {
1166        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1167            lazyPersistFileScrubIntervalSec));
1168        lazyPersistFileScrubber.start();
1169      }
1170
1171      cacheManager.startMonitorThread();
1172      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1173    } finally {
1174      startingActiveService = false;
1175      checkSafeMode();
1176      writeUnlock("startActiveServices");
1177    }
1178  }
1179
1180  /**
1181   * Initialize replication queues.
1182   */
1183  private void initializeReplQueues() {
1184    LOG.info("initializing replication queues");
1185    blockManager.processMisReplicatedBlocks();
1186    initializedReplQueues = true;
1187  }
1188
1189  private boolean inActiveState() {
1190    return haContext != null &&
1191        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
1192  }
1193
1194  /**
1195   * @return Whether the namenode is transitioning to active state and is in the
1196   *         middle of the {@link #startActiveServices()}
1197   */
1198  public boolean inTransitionToActive() {
1199    return haEnabled && inActiveState() && startingActiveService;
1200  }
1201
1202  private boolean shouldUseDelegationTokens() {
1203    return UserGroupInformation.isSecurityEnabled() ||
1204      alwaysUseDelegationTokensForTests;
1205  }
1206
1207  /** 
1208   * Stop services required in active state
1209   */
1210  void stopActiveServices() {
1211    LOG.info("Stopping services started for active state");
1212    writeLock();
1213    try {
1214      stopSecretManager();
1215      leaseManager.stopMonitor();
1216      if (nnrmthread != null) {
1217        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1218        nnrmthread.interrupt();
1219      }
1220      if (nnEditLogRoller != null) {
1221        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1222        nnEditLogRoller.interrupt();
1223      }
1224      if (lazyPersistFileScrubber != null) {
1225        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1226        lazyPersistFileScrubber.interrupt();
1227      }
1228      if (dir != null && getFSImage() != null) {
1229        if (getFSImage().editLog != null) {
1230          getFSImage().editLog.close();
1231        }
1232        // Update the fsimage with the last txid that we wrote
1233        // so that the tailer starts from the right spot.
1234        getFSImage().updateLastAppliedTxIdFromWritten();
1235      }
1236      if (cacheManager != null) {
1237        cacheManager.stopMonitorThread();
1238        cacheManager.clearDirectiveStats();
1239      }
1240      blockManager.getDatanodeManager().clearPendingCachingCommands();
1241      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1242      // Don't want to keep replication queues when not in Active.
1243      blockManager.clearQueues();
1244      initializedReplQueues = false;
1245    } finally {
1246      writeUnlock("stopActiveServices");
1247    }
1248  }
1249  
1250  /**
1251   * Start services required in standby state 
1252   * 
1253   * @throws IOException
1254   */
1255  void startStandbyServices(final Configuration conf) throws IOException {
1256    LOG.info("Starting services required for standby state");
1257    if (!getFSImage().editLog.isOpenForRead()) {
1258      // During startup, we're already open for read.
1259      getFSImage().editLog.initSharedJournalsForRead();
1260    }
1261    
1262    blockManager.setPostponeBlocksFromFuture(true);
1263
1264    // Disable quota checks while in standby.
1265    dir.disableQuotaChecks();
1266    editLogTailer = new EditLogTailer(this, conf);
1267    editLogTailer.start();
1268    if (standbyShouldCheckpoint) {
1269      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1270      standbyCheckpointer.start();
1271    }
1272  }
1273
1274  /**
1275   * Called when the NN is in Standby state and the editlog tailer tails the
1276   * OP_ROLLING_UPGRADE_START.
1277   */
1278  void triggerRollbackCheckpoint() {
1279    setNeedRollbackFsImage(true);
1280    if (standbyCheckpointer != null) {
1281      standbyCheckpointer.triggerRollbackCheckpoint();
1282    }
1283  }
1284
1285  /**
1286   * Called while the NN is in Standby state, but just about to be
1287   * asked to enter Active state. This cancels any checkpoints
1288   * currently being taken.
1289   */
1290  void prepareToStopStandbyServices() throws ServiceFailedException {
1291    if (standbyCheckpointer != null) {
1292      standbyCheckpointer.cancelAndPreventCheckpoints(
1293          "About to leave standby state");
1294    }
1295  }
1296
1297  /** Stop services required in standby state */
1298  void stopStandbyServices() throws IOException {
1299    LOG.info("Stopping services started for standby state");
1300    if (standbyCheckpointer != null) {
1301      standbyCheckpointer.stop();
1302    }
1303    if (editLogTailer != null) {
1304      editLogTailer.stop();
1305    }
1306    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
1307      getFSImage().editLog.close();
1308    }
1309  }
1310  
1311  @Override
1312  public void checkOperation(OperationCategory op) throws StandbyException {
1313    if (haContext != null) {
1314      // null in some unit tests
1315      haContext.checkOperation(op);
1316    }
1317  }
1318  
1319  /**
1320   * @throws RetriableException
1321   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1322   *           NameNode is in active state
1323   * @throws SafeModeException
1324   *           Otherwise if NameNode is in SafeMode.
1325   */
1326  void checkNameNodeSafeMode(String errorMsg)
1327      throws RetriableException, SafeModeException {
1328    if (isInSafeMode()) {
1329      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1330      if (haEnabled && haContext != null
1331          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1332          && shouldRetrySafeMode(this.safeMode)) {
1333        throw new RetriableException(se);
1334      } else {
1335        throw se;
1336      }
1337    }
1338  }
1339
1340  boolean isPermissionEnabled() {
1341    return isPermissionEnabled;
1342  }
1343
1344  /**
1345   * We already know that the safemode is on. We will throw a RetriableException
1346   * if the safemode is not manual or caused by low resource.
1347   */
1348  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1349    if (safeMode == null) {
1350      return false;
1351    } else {
1352      return !safeMode.isManual() && !safeMode.areResourcesLow();
1353    }
1354  }
1355  
1356  public static Collection<URI> getNamespaceDirs(Configuration conf) {
1357    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
1358  }
1359
1360  /**
1361   * Get all edits dirs which are required. If any shared edits dirs are
1362   * configured, these are also included in the set of required dirs.
1363   * 
1364   * @param conf the HDFS configuration.
1365   * @return all required dirs.
1366   */
1367  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1368    Set<URI> ret = new HashSet<URI>();
1369    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1370    ret.addAll(getSharedEditsDirs(conf));
1371    return ret;
1372  }
1373
  /**
   * Resolve the storage directories configured under {@code propertyName}
   * as URIs. When starting with the IMPORT option, directories that came
   * from the default configuration files are filtered out (so only values
   * explicitly set in hdfs-site.xml remain); otherwise an empty setting
   * falls back to the default edits directory.
   *
   * @param conf the configuration to read.
   * @param propertyName the storage-directory property to resolve.
   * @return the configured directories as URIs.
   */
  private static Collection<URI> getStorageDirs(Configuration conf,
                                                String propertyName) {
    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if(startOpt == StartupOption.IMPORT) {
      // In case of IMPORT this will get rid of default directories 
      // but will retain directories specified in hdfs-site.xml
      // When importing image from a checkpoint, the name-node can
      // start with empty set of storage directories.
      Configuration cE = new HdfsConfiguration(false);
      cE.addResource("core-default.xml");
      cE.addResource("core-site.xml");
      cE.addResource("hdfs-default.xml");
      // Subtract everything the default/core resources would have provided.
      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
      dirNames.removeAll(dirNames2);
      if(dirNames.isEmpty())
        LOG.warn("!!! WARNING !!!" +
          "\n\tThe NameNode currently runs without persistent storage." +
          "\n\tAny changes to the file system meta-data may be lost." +
          "\n\tRecommended actions:" +
          "\n\t\t- shutdown and restart NameNode with configured \"" 
          + propertyName + "\" in hdfs-site.xml;" +
          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
          "of the file system meta-data.")
      ;
    } else if (dirNames.isEmpty()) {
      // Nothing configured: fall back to the default edits directory.
      dirNames = Collections.singletonList(
          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }
1404
1405  /**
1406   * Return an ordered list of edits directories to write to.
1407   * The list is ordered such that all shared edits directories
1408   * are ordered before non-shared directories, and any duplicates
1409   * are removed. The order they are specified in the configuration
1410   * is retained.
1411   * @return Collection of shared edits directories.
1412   * @throws IOException if multiple shared edits directories are configured
1413   */
1414  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1415      throws IOException {
1416    return getNamespaceEditsDirs(conf, true);
1417  }
1418  
  /**
   * Same as {@link #getNamespaceEditsDirs(Configuration)}, but the shared
   * edits directories can optionally be excluded.
   *
   * @param conf the HDFS configuration.
   * @param includeShared whether shared edits directories should be included
   *                      (ordered before the non-shared ones).
   * @return ordered, de-duplicated list of edits directories; falls back to
   *         the image directories when none are configured.
   * @throws IOException if multiple shared edits directories are configured.
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
    
    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);
  
      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }
  
      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }    
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }
1463  
1464  /**
1465   * Returns edit directories that are shared between primary and secondary.
1466   * @param conf configuration
1467   * @return collection of edit directories from {@code conf}
1468   */
1469  public static List<URI> getSharedEditsDirs(Configuration conf) {
1470    // don't use getStorageDirs here, because we want an empty default
1471    // rather than the dir in /tmp
1472    Collection<String> dirNames = conf.getTrimmedStringCollection(
1473        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1474    return Util.stringCollectionAsURIs(dirNames);
1475  }
1476
  /** Acquire the namesystem read lock (delegates to {@code fsLock}). */
  @Override
  public void readLock() {
    this.fsLock.readLock();
  }
  /** Release the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readUnlock();
  }
  /** Release the read lock; {@code opName} names the finishing operation
   *  (its use is up to the fsLock implementation). */
  public void readUnlock(String opName) {
    this.fsLock.readUnlock(opName);
  }
  /** Acquire the namesystem write lock. */
  @Override
  public void writeLock() {
    this.fsLock.writeLock();
  }
  /** Acquire the write lock, aborting if the thread is interrupted. */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLockInterruptibly();
  }
  /** Release the namesystem write lock. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeUnlock();
  }
  /** Release the write lock; {@code opName} names the finishing operation
   *  (its use is up to the fsLock implementation). */
  public void writeUnlock(String opName) {
    this.fsLock.writeUnlock(opName);
  }
  /** @return true if the current thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /** @return true if the current thread holds the read lock or, since the
   *  write lock implies read access, the write lock. */
  @Override
  public boolean hasReadLock() {
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1511
  /** @return the current thread's reentrant hold count on the read lock. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1515
  /** @return the current thread's reentrant hold count on the write lock. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1519
  /** Acquire the checkpoint lock. */
  public void cpLock() {
    this.cpLock.lock();
  }
1524
  /** Acquire the checkpoint lock interruptibly, aborting if the thread is
   *  interrupted while waiting. */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }
1529
  /** Release the checkpoint lock. */
  public void cpUnlock() {
    this.cpLock.unlock();
  }
1534    
1535
  /** Build a {@link NamespaceInfo} snapshot while holding the read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock("getNamespaceInfo");
    }
  }
1544
1545  /**
1546   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1547   */
1548  NamespaceInfo unprotectedGetNamespaceInfo() {
1549    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
1550        getClusterId(), getBlockPoolId(),
1551        getFSImage().getStorage().getCTime());
1552  }
1553
1554  /**
1555   * Close down this file system manager.
1556   * Causes heartbeat and lease daemons to stop; waits briefly for
1557   * them to finish, but a short timeout returns control back to caller.
1558   */
1559  void close() {
1560    fsRunning = false;
1561    try {
1562      stopCommonServices();
1563      if (smmthread != null) smmthread.interrupt();
1564    } finally {
1565      // using finally to ensure we also wait for lease daemon
1566      try {
1567        stopActiveServices();
1568        stopStandbyServices();
1569      } catch (IOException ie) {
1570      } finally {
1571        IOUtils.cleanup(LOG, dir);
1572        IOUtils.cleanup(LOG, fsImage);
1573      }
1574    }
1575  }
1576
  /** @return true until {@link #close()} flips {@code fsRunning} to false. */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1581  
  /**
   * @return true if this namesystem is in standby state, or if HA is enabled
   * and startup has not yet established an HA state.
   */
  @Override
  public boolean isInStandbyState() {
    if (haContext == null || haContext.getState() == null) {
      // We're still starting up. In this case, if HA is
      // on for the cluster, we always start in standby. Otherwise
      // start in active.
      return haEnabled;
    }

    return HAServiceState.STANDBY == haContext.getState().getServiceState();
  }
1593
1594  /**
1595   * Dump all metadata into specified file
1596   */
1597  void metaSave(String filename) throws IOException {
1598    checkSuperuserPrivilege();
1599    checkOperation(OperationCategory.UNCHECKED);
1600    writeLock();
1601    try {
1602      checkOperation(OperationCategory.UNCHECKED);
1603      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1604      PrintWriter out = new PrintWriter(new BufferedWriter(
1605          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1606      metaSave(out);
1607      out.flush();
1608      out.close();
1609    } finally {
1610      writeUnlock("metaSave");
1611    }
1612  }
1613
1614  private void metaSave(PrintWriter out) {
1615    assert hasWriteLock();
1616    long totalInodes = this.dir.totalInodes();
1617    long totalBlocks = this.getBlocksTotal();
1618    out.println(totalInodes + " files and directories, " + totalBlocks
1619        + " blocks = " + (totalInodes + totalBlocks) + " total");
1620
1621    blockManager.metaSave(out);
1622  }
1623
1624  private String metaSaveAsString() {
1625    StringWriter sw = new StringWriter();
1626    PrintWriter pw = new PrintWriter(sw);
1627    metaSave(pw);
1628    pw.flush();
1629    return sw.toString();
1630  }
1631
  /** @return the cached server defaults; the READ operation-category check
   *  may throw StandbyException before the value is returned. */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1636
  /** @return the configured access-time precision; a non-positive value
   *  means access times are not supported (see isAccessTimeSupported). */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1640
  /** @return true when a positive access-time precision is configured. */
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1644
1645  /////////////////////////////////////////////////////////
1646  //
1647  // These methods are called by HadoopFS clients
1648  //
1649  /////////////////////////////////////////////////////////
1650  /**
1651   * Set permissions for an existing file.
1652   * @throws IOException
1653   */
1654  void setPermission(String src, FsPermission permission) throws IOException {
1655    final String operationName = "setPermission";
1656    HdfsFileStatus auditStat;
1657    checkOperation(OperationCategory.WRITE);
1658    writeLock();
1659    try {
1660      checkOperation(OperationCategory.WRITE);
1661      checkNameNodeSafeMode("Cannot set permission for " + src);
1662      auditStat = FSDirAttrOp.setPermission(dir, src, permission);
1663    } catch (AccessControlException e) {
1664      logAuditEvent(false, operationName, src);
1665      throw e;
1666    } finally {
1667      writeUnlock(operationName);
1668    }
1669    getEditLog().logSync();
1670    logAuditEvent(true, operationName, src, null, auditStat);
1671  }
1672
1673  /**
1674   * Set owner for an existing file.
1675   * @throws IOException
1676   */
1677  void setOwner(String src, String username, String group)
1678      throws IOException {
1679    final String operationName = "setOwner";
1680    HdfsFileStatus auditStat;
1681    checkOperation(OperationCategory.WRITE);
1682    writeLock();
1683    try {
1684      checkOperation(OperationCategory.WRITE);
1685      checkNameNodeSafeMode("Cannot set owner for " + src);
1686      auditStat = FSDirAttrOp.setOwner(dir, src, username, group);
1687    } catch (AccessControlException e) {
1688      logAuditEvent(false, operationName, src);
1689      throw e;
1690    } finally {
1691      writeUnlock(operationName);
1692    }
1693    getEditLog().logSync();
1694    logAuditEvent(true, operationName, src, null, auditStat);
1695  }
1696
  /** Result of a block-locations lookup: the located blocks plus a flag
   *  telling the caller whether the file's access time should be updated
   *  (see the follow-up under the write lock in getBlockLocations). */
  static class GetBlockLocationsResult {
    final boolean updateAccessTime;
    final LocatedBlocks blocks;
    boolean updateAccessTime() {
      return updateAccessTime;
    }
    private GetBlockLocationsResult(
        boolean updateAccessTime, LocatedBlocks blocks) {
      this.updateAccessTime = updateAccessTime;
      this.blocks = blocks;
    }
  }
1709
1710  /**
1711   * Get block locations within the specified range.
1712   * @see ClientProtocol#getBlockLocations(String, long, long)
1713   */
1714  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
1715      long offset, long length) throws IOException {
1716    final String operationName = "open";
1717    checkOperation(OperationCategory.READ);
1718    GetBlockLocationsResult res = null;
1719    FSPermissionChecker pc = getPermissionChecker();
1720    readLock();
1721    try {
1722      checkOperation(OperationCategory.READ);
1723      res = getBlockLocations(pc, srcArg, offset, length, true, true);
1724    } catch (AccessControlException e) {
1725      logAuditEvent(false, operationName, srcArg);
1726      throw e;
1727    } finally {
1728      readUnlock(operationName);
1729    }
1730
1731    logAuditEvent(true, operationName, srcArg);
1732
1733    if (res.updateAccessTime()) {
1734      String src = srcArg;
1735      writeLock();
1736      final long now = now();
1737      try {
1738        checkOperation(OperationCategory.WRITE);
1739        /**
1740         * Resolve the path again and update the atime only when the file
1741         * exists.
1742         *
1743         * XXX: Races can still occur even after resolving the path again.
1744         * For example:
1745         *
1746         * <ul>
1747         *   <li>Get the block location for "/a/b"</li>
1748         *   <li>Rename "/a/b" to "/c/b"</li>
1749         *   <li>The second resolution still points to "/a/b", which is
1750         *   wrong.</li>
1751         * </ul>
1752         *
1753         * The behavior is incorrect but consistent with the one before
1754         * HDFS-7463. A better fix is to change the edit log of SetTime to
1755         * use inode id instead of a path.
1756         */
1757        final INodesInPath iip = dir.resolvePath(pc, src);
1758        src = iip.getPath();
1759        INode inode = iip.getLastINode();
1760        boolean updateAccessTime = inode != null &&
1761            now > inode.getAccessTime() + getAccessTimePrecision();
1762        if (!isInSafeMode() && updateAccessTime) {
1763          boolean changed = FSDirAttrOp.setTimes(dir,
1764              inode, -1, now, false, iip.getLatestSnapshotId());
1765          if (changed) {
1766            getEditLog().logTimes(src, -1, now);
1767          }
1768        }
1769      } catch (Throwable e) {
1770        LOG.warn("Failed to update the access time of " + src, e);
1771      } finally {
1772        writeUnlock(operationName);
1773      }
1774    }
1775
1776    LocatedBlocks blocks = res.blocks;
1777    if (blocks != null) {
1778      blockManager.getDatanodeManager().sortLocatedBlocks(
1779          clientMachine, blocks.getLocatedBlocks());
1780
1781      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
1782      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1783      if (lastBlock != null) {
1784        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
1785        blockManager.getDatanodeManager().sortLocatedBlocks(
1786            clientMachine, lastBlockList);
1787      }
1788    }
1789    return blocks;
1790  }
1791
1792  /**
1793   * Get block locations within the specified range.
1794   * @see ClientProtocol#getBlockLocations(String, long, long)
1795   * @throws IOException
1796   */
1797  GetBlockLocationsResult getBlockLocations(
1798      FSPermissionChecker pc, String src, long offset, long length,
1799      boolean needBlockToken, boolean checkSafeMode) throws IOException {
1800    if (offset < 0) {
1801      throw new HadoopIllegalArgumentException(
1802          "Negative offset is not supported. File: " + src);
1803    }
1804    if (length < 0) {
1805      throw new HadoopIllegalArgumentException(
1806          "Negative length is not supported. File: " + src);
1807    }
1808    final GetBlockLocationsResult ret = getBlockLocationsInt(
1809        pc, src, offset, length, needBlockToken);
1810
1811    if (checkSafeMode && isInSafeMode()) {
1812      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
1813        // if safemode & no block locations yet then throw safemodeException
1814        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1815          SafeModeException se = new SafeModeException(
1816              "Zero blocklocations for " + src, safeMode);
1817          if (haEnabled && haContext != null &&
1818              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1819            throw new RetriableException(se);
1820          } else {
1821            throw se;
1822          }
1823        }
1824      }
1825    }
1826    return ret;
1827  }
1828
  /**
   * Resolve {@code srcArg}, check read access, and build the located-block
   * list, also computing whether the file's access time is stale.
   * Called with the namesystem lock held (see getBlockLocations).
   */
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    final INodesInPath iip = dir.resolvePath(pc, src);
    src = iip.getPath();
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    // For snapshots, the size is fixed at snapshot time; otherwise exclude
    // the last under-construction block from the visible size.
    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // Reserved-raw paths skip encryption info resolution.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // atime is only updated outside safe mode, never for snapshot paths,
    // and only when it is older than the configured precision.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }
1872
1873  /**
1874   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1875   * To avoid rollbacks we will verify validity of ALL of the args
1876   * before we start actual move.
1877   * 
1878   * This does not support ".inodes" relative path
1879   * @param target target to concat into
1880   * @param srcs file that will be concatenated
1881   * @throws IOException on error
1882   */
1883  void concat(String target, String [] srcs, boolean logRetryCache)
1884      throws IOException {
1885    waitForLoadingFSImage();
1886    final String operationName = "concat";
1887    HdfsFileStatus stat = null;
1888    boolean success = false;
1889    writeLock();
1890    try {
1891      checkOperation(OperationCategory.WRITE);
1892      checkNameNodeSafeMode("Cannot concat " + target);
1893      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
1894      success = true;
1895    } finally {
1896      writeUnlock(operationName);
1897      if (success) {
1898        getEditLog().logSync();
1899      }
1900      logAuditEvent(success, operationName, Arrays.toString(srcs),
1901          target, stat);
1902    }
1903  }
1904
1905  /**
1906   * stores the modification and access time for this inode. 
1907   * The access time is precise up to an hour. The transaction, if needed, is
1908   * written to the edits log but is not flushed.
1909   */
1910  void setTimes(String src, long mtime, long atime) throws IOException {
1911    final String operationName = "setTimes";
1912    HdfsFileStatus auditStat;
1913    checkOperation(OperationCategory.WRITE);
1914    writeLock();
1915    try {
1916      checkOperation(OperationCategory.WRITE);
1917      checkNameNodeSafeMode("Cannot set times " + src);
1918      auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime);
1919    } catch (AccessControlException e) {
1920      logAuditEvent(false, operationName, src);
1921      throw e;
1922    } finally {
1923      writeUnlock(operationName);
1924    }
1925    getEditLog().logSync();
1926    logAuditEvent(true, operationName, src, null, auditStat);
1927  }
1928
1929  /**
1930   * Create a symbolic link.
1931   */
1932  @SuppressWarnings("deprecation")
1933  void createSymlink(String target, String link,
1934      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache)
1935      throws IOException {
1936    final String operationName = "createSymlink";
1937    if (!FileSystem.areSymlinksEnabled()) {
1938      throw new UnsupportedOperationException("Symlinks not supported");
1939    }
1940    HdfsFileStatus auditStat = null;
1941    checkOperation(OperationCategory.WRITE);
1942    writeLock();
1943    try {
1944      checkOperation(OperationCategory.WRITE);
1945      checkNameNodeSafeMode("Cannot create symlink " + link);
1946      auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms,
1947                                                  createParent, logRetryCache);
1948    } catch (AccessControlException e) {
1949      logAuditEvent(false, operationName, link, target, null);
1950      throw e;
1951    } finally {
1952      writeUnlock(operationName);
1953    }
1954    getEditLog().logSync();
1955    logAuditEvent(true, operationName, link, target, auditStat);
1956  }
1957
1958  /**
1959   * Set replication for an existing file.
1960   * 
1961   * The NameNode sets new replication and schedules either replication of 
1962   * under-replicated data blocks or removal of the excessive block copies 
1963   * if the blocks are over-replicated.
1964   * 
1965   * @see ClientProtocol#setReplication(String, short)
1966   * @param src file name
1967   * @param replication new replication
1968   * @return true if successful; 
1969   *         false if file does not exist or is a directory
1970   */
1971  boolean setReplication(final String src, final short replication)
1972      throws IOException {
1973    final String operationName = "setReplication";
1974    boolean success = false;
1975    waitForLoadingFSImage();
1976    checkOperation(OperationCategory.WRITE);
1977    writeLock();
1978    try {
1979      checkOperation(OperationCategory.WRITE);
1980      checkNameNodeSafeMode("Cannot set replication for " + src);
1981      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
1982    } catch (AccessControlException e) {
1983      logAuditEvent(false, operationName, src);
1984      throw e;
1985    } finally {
1986      writeUnlock(operationName);
1987    }
1988    if (success) {
1989      getEditLog().logSync();
1990      logAuditEvent(true, operationName, src);
1991    }
1992    return success;
1993  }
1994
1995  /**
1996   * Truncate file to a lower length.
1997   * Truncate cannot be reverted / recovered from as it causes data loss.
1998   * Truncation at block boundary is atomic, otherwise it requires
1999   * block recovery to truncate the last block of the file.
2000   *
2001   * @return true if client does not need to wait for block recovery,
2002   * false if client needs to wait for block recovery.
2003   */
2004  boolean truncate(String src, long newLength,
2005                   String clientName, String clientMachine,
2006                   long mtime)
2007      throws IOException, UnresolvedLinkException {
2008    boolean ret;
2009    try {
2010      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
2011    } catch (AccessControlException e) {
2012      logAuditEvent(false, "truncate", src);
2013      throw e;
2014    }
2015    return ret;
2016  }
2017
2018  boolean truncateInt(String srcArg, long newLength,
2019                      String clientName, String clientMachine,
2020                      long mtime)
2021      throws IOException, UnresolvedLinkException {
2022    final String operationName = "truncate";
2023    String src = srcArg;
2024    NameNode.stateChangeLog.debug(
2025        "DIR* NameSystem.truncate: src={} newLength={}", src, newLength);
2026    if (newLength < 0) {
2027      throw new HadoopIllegalArgumentException(
2028          "Cannot truncate to a negative file size: " + newLength + ".");
2029    }
2030    HdfsFileStatus stat = null;
2031    FSPermissionChecker pc = getPermissionChecker();
2032    checkOperation(OperationCategory.WRITE);
2033    boolean res;
2034    writeLock();
2035    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
2036    try {
2037      checkOperation(OperationCategory.WRITE);
2038      checkNameNodeSafeMode("Cannot truncate for " + src);
2039      INodesInPath iip = dir.resolvePath(pc, src);
2040      src = iip.getPath();
2041      res = truncateInternal(src, newLength, clientName,
2042          clientMachine, mtime, pc, toRemoveBlocks);
2043      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
2044    } finally {
2045      writeUnlock(operationName);
2046    }
2047    getEditLog().logSync();
2048    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
2049      removeBlocks(toRemoveBlocks);
2050      toRemoveBlocks.clear();
2051    }
2052    logAuditEvent(true, operationName, src, null, stat);
2053    return res;
2054  }
2055
2056  /**
2057   * Truncate a file to a given size
2058   * Update the count at each ancestor directory with quota
2059   */
2060  boolean truncateInternal(String src, long newLength,
2061                           String clientName, String clientMachine,
2062                           long mtime, FSPermissionChecker pc,
2063                           BlocksMapUpdateInfo toRemoveBlocks)
2064      throws IOException, UnresolvedLinkException {
2065    assert hasWriteLock();
2066    INodesInPath iip = dir.getINodesInPath4Write(src, true);
2067    if (isPermissionEnabled) {
2068      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2069    }
2070    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
2071    final BlockStoragePolicy lpPolicy =
2072        blockManager.getStoragePolicy("LAZY_PERSIST");
2073
2074    if (lpPolicy != null &&
2075        lpPolicy.getId() == file.getStoragePolicyID()) {
2076      throw new UnsupportedOperationException(
2077          "Cannot truncate lazy persist file " + src);
2078    }
2079
2080    // Check if the file is already being truncated with the same length
2081    final BlockInfoContiguous last = file.getLastBlock();
2082    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2083      final Block truncateBlock
2084          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
2085      if (truncateBlock != null) {
2086        final long truncateLength = file.computeFileSize(false, false)
2087            + truncateBlock.getNumBytes();
2088        if (newLength == truncateLength) {
2089          return false;
2090        }
2091      }
2092    }
2093
2094    // Opening an existing file for truncate. May need lease recovery.
2095    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
2096        iip, src, clientName, clientMachine, false);
2097    // Truncate length check.
2098    long oldLength = file.computeFileSize();
2099    if(oldLength == newLength) {
2100      return true;
2101    }
2102    if(oldLength < newLength) {
2103      throw new HadoopIllegalArgumentException(
2104          "Cannot truncate to a larger file size. Current size: " + oldLength +
2105              ", truncate size: " + newLength + ".");
2106    }
2107    // Perform INodeFile truncation.
2108    final QuotaCounts delta = new QuotaCounts.Builder().build();
2109    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
2110        mtime, delta);
2111    Block truncateBlock = null;
2112    if(!onBlockBoundary) {
2113      // Open file for write, but don't log into edits
2114      long lastBlockDelta = file.computeFileSize() - newLength;
2115      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
2116      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
2117          lastBlockDelta, null);
2118    }
2119
2120    // update the quota: use the preferred block size for UC block
2121    dir.writeLock();
2122    try {
2123      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2124    } finally {
2125      dir.writeUnlock();
2126    }
2127
2128    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
2129        truncateBlock);
2130    return onBlockBoundary;
2131  }
2132
2133  /**
2134   * Convert current INode to UnderConstruction.
2135   * Recreate lease.
2136   * Create new block for the truncated copy.
2137   * Schedule truncation of the replicas.
2138   *
2139   * @return the returned block will be written to editLog and passed back into
2140   * this method upon loading.
2141   */
2142  Block prepareFileForTruncate(INodesInPath iip,
2143                               String leaseHolder,
2144                               String clientMachine,
2145                               long lastBlockDelta,
2146                               Block newBlock)
2147      throws IOException {
2148    INodeFile file = iip.getLastINode().asFile();
2149    String src = iip.getPath();
2150    file.recordModification(iip.getLatestSnapshotId());
2151    file.toUnderConstruction(leaseHolder, clientMachine);
2152    assert file.isUnderConstruction() : "inode should be under construction.";
2153    leaseManager.addLease(
2154        file.getFileUnderConstructionFeature().getClientName(), src);
2155    boolean shouldRecoverNow = (newBlock == null);
2156    BlockInfoContiguous oldBlock = file.getLastBlock();
2157    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
2158    if(newBlock == null) {
2159      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
2160          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
2161              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
2162    }
2163
2164    BlockInfoContiguousUnderConstruction truncatedBlockUC;
2165    if(shouldCopyOnTruncate) {
2166      // Add new truncateBlock into blocksMap and
2167      // use oldBlock as a source for copy-on-truncate recovery
2168      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
2169          file.getBlockReplication());
2170      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
2171      truncatedBlockUC.setTruncateBlock(oldBlock);
2172      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
2173      getBlockManager().addBlockCollection(truncatedBlockUC, file);
2174
2175      NameNode.stateChangeLog.debug(
2176          "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" +
2177          " size {}  new block {} old block {}", truncatedBlockUC.getNumBytes(),
2178          newBlock, truncatedBlockUC.getTruncateBlock());
2179    } else {
2180      // Use new generation stamp for in-place truncate recovery
2181      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
2182      oldBlock = file.getLastBlock();
2183      assert !oldBlock.isComplete() : "oldBlock should be under construction";
2184      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
2185      truncatedBlockUC.setTruncateBlock(new Block(oldBlock));
2186      truncatedBlockUC.getTruncateBlock().setNumBytes(
2187          oldBlock.getNumBytes() - lastBlockDelta);
2188      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
2189          newBlock.getGenerationStamp());
2190
2191      NameNode.stateChangeLog.debug(
2192          "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " +
2193          "truncate to new size {}",
2194          truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC);
2195    }
2196    if (shouldRecoverNow) {
2197      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp());
2198    }
2199
2200    return newBlock;
2201  }
2202
2203  /**
2204   * Defines if a replica needs to be copied on truncate or
2205   * can be truncated in place.
2206   */
2207  boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) {
2208    if(!isUpgradeFinalized()) {
2209      return true;
2210    }
2211    if (isRollingUpgrade()) {
2212      return true;
2213    }
2214    return file.isBlockInLatestSnapshot(blk);
2215  }
2216
2217  /**
2218   * Set the storage policy for a file or a directory.
2219   *
2220   * @param src file/directory path
2221   * @param policyName storage policy name
2222   */
2223  void setStoragePolicy(String src, String policyName) throws IOException {
2224    HdfsFileStatus auditStat;
2225    waitForLoadingFSImage();
2226    checkOperation(OperationCategory.WRITE);
2227    final String operationName = "setStoragePolicy";
2228    writeLock();
2229    try {
2230      checkOperation(OperationCategory.WRITE);
2231      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2232      auditStat = FSDirAttrOp.setStoragePolicy(
2233          dir, blockManager, src, policyName);
2234    } catch (AccessControlException e) {
2235      logAuditEvent(false, operationName, src);
2236      throw e;
2237    } finally {
2238      writeUnlock(operationName);
2239    }
2240    getEditLog().logSync();
2241    logAuditEvent(true, operationName, src, null, auditStat);
2242  }
2243
2244  /**
2245   * @return All the existing block storage policies
2246   */
2247  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2248    checkOperation(OperationCategory.READ);
2249    waitForLoadingFSImage();
2250    readLock();
2251    try {
2252      checkOperation(OperationCategory.READ);
2253      return FSDirAttrOp.getStoragePolicies(blockManager);
2254    } finally {
2255      readUnlock("getStoragePolicies");
2256    }
2257  }
2258
  /** @return the preferred block size for the file at {@code src},
   *  looked up under the read lock. */
  long getPreferredBlockSize(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      return FSDirAttrOp.getPreferredBlockSize(dir, src);
    } finally {
      readUnlock("getPreferredBlockSize");
    }
  }
2269
2270  /**
2271   * If the file is within an encryption zone, select the appropriate 
2272   * CryptoProtocolVersion from the list provided by the client. Since the
2273   * client may be newer, we need to handle unknown versions.
2274   *
2275   * @param zone EncryptionZone of the file
2276   * @param supportedVersions List of supported protocol versions
2277   * @return chosen protocol version
2278   * @throws IOException
2279   */
2280  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2281      CryptoProtocolVersion[] supportedVersions)
2282      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2283        SnapshotAccessControlException {
2284    Preconditions.checkNotNull(zone);
2285    Preconditions.checkNotNull(supportedVersions);
2286    // Right now, we only support a single protocol version,
2287    // so simply look for it in the list of provided options
2288    final CryptoProtocolVersion required = zone.getVersion();
2289
2290    for (CryptoProtocolVersion c : supportedVersions) {
2291      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2292        if (LOG.isDebugEnabled()) {
2293          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2294              "client: " + c.getUnknownValue());
2295        }
2296        continue;
2297      }
2298      if (c.equals(required)) {
2299        return c;
2300      }
2301    }
2302    throw new UnknownCryptoProtocolVersionException(
2303        "No crypto protocol versions provided by the client are supported."
2304            + " Client provided: " + Arrays.toString(supportedVersions)
2305            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2306            .values()));
2307  }
2308
2309  /**
2310   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2311   * encryption zone. Should not be called with any locks held.
2312   *
2313   * @param ezKeyName key name of an encryption zone
2314   * @return New EDEK, or null if ezKeyName is null
2315   * @throws IOException
2316   */
2317  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2318      ezKeyName) throws IOException {
2319    if (ezKeyName == null) {
2320      return null;
2321    }
2322    EncryptedKeyVersion edek = null;
2323    try {
2324      edek = provider.generateEncryptedKey(ezKeyName);
2325    } catch (GeneralSecurityException e) {
2326      throw new IOException(e);
2327    }
2328    Preconditions.checkNotNull(edek);
2329    return edek;
2330  }
2331
2332  /**
2333   * Create a new file entry in the namespace.
2334   * 
2335   * For description of parameters and exceptions thrown see
2336   * {@link ClientProtocol#create}, except it returns valid file status upon
2337   * success
2338   */
2339  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2340      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2341      boolean createParent, short replication, long blockSize, 
2342      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
2343      throws AccessControlException, SafeModeException,
2344      FileAlreadyExistsException, UnresolvedLinkException,
2345      FileNotFoundException, ParentNotDirectoryException, IOException {
2346
2347    HdfsFileStatus status = null;
2348    try {
2349      status = startFileInt(src, permissions, holder, clientMachine, flag,
2350          createParent, replication, blockSize, supportedVersions,
2351          logRetryCache);
2352    } catch (AccessControlException e) {
2353      logAuditEvent(false, "create", src);
2354      throw e;
2355    }
2356    return status;
2357  }
2358
  /**
   * Worker for {@link #startFile}: performs the actual file creation.
   *
   * If the target path is inside an encryption zone, an EDEK is generated
   * optimistically before the write lock is taken (the KeyProvider call is
   * an RPC and must not run under the namesystem lock). The encryption-zone
   * preconditions are re-validated later in
   * {@code startFileInternal} once the write lock is held; if they no
   * longer hold, the client is asked to retry via RetryStartFileException.
   *
   * @param srcArg path of the file to create
   * @param permissions permissions to apply to the new file
   * @param holder lease holder (client name)
   * @param clientMachine identifier of the client machine
   * @param flag create flags (CREATE / OVERWRITE / LAZY_PERSIST, ...)
   * @param createParent whether to implicitly create missing ancestors
   * @param blockSize requested block size; must be >= the configured minimum
   * @param supportedVersions crypto protocol versions the client supports
   * @param logRetryCache whether to record RPC ids in the edit log for
   *                      retry-cache rebuilding
   * @return file status of the newly created file
   */
  private HdfsFileStatus startFileInt(final String srcArg,
      PermissionStatus permissions, String holder, String clientMachine,
      EnumSet<CreateFlag> flag, boolean createParent, short replication,
      long blockSize, CryptoProtocolVersion[] supportedVersions,
      boolean logRetryCache)
      throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {
    String src = srcArg;
    final String operationName = "create";
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append("DIR* NameSystem.startFile: src=" + src
              + ", holder=" + holder
              + ", clientMachine=" + clientMachine
              + ", createParent=" + createParent
              + ", replication=" + replication
              + ", createFlag=" + flag.toString()
              + ", blockSize=" + blockSize);
      builder.append(", supportedVersions=");
      if (supportedVersions != null) {
        builder.append(Arrays.toString(supportedVersions));
      } else {
        builder.append("null");
      }
      NameNode.stateChangeLog.debug(builder.toString());
    }
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    }
    // Fail fast (before taking any locks) on an invalid replication factor.
    blockManager.verifyReplication(src, replication, clientMachine);

    boolean skipSync = false;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    if (blockSize < minBlockSize) {
      throw new IOException("Specified block size is less than configured" +
          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
          + "): " + blockSize + " < " + minBlockSize);
    }
    boolean create = flag.contains(CreateFlag.CREATE);
    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);

    waitForLoadingFSImage();

    /**
     * If the file is in an encryption zone, we optimistically create an
     * EDEK for the file by calling out to the configured KeyProvider.
     * Since this typically involves doing an RPC, we take the readLock
     * initially, then drop it to do the RPC.
     * 
     * Since the path can flip-flop between being in an encryption zone and not
     * in the meantime, we need to recheck the preconditions when we retake the
     * lock to do the create. If the preconditions are not met, we throw a
     * special RetryStartFileException to ask the DFSClient to try the create
     * again later.
     */
    CryptoProtocolVersion protocolVersion = null;
    CipherSuite suite = null;
    String ezKeyName = null;
    EncryptedKeyVersion edek = null;

    if (provider != null) {
      // Phase 1 (read lock): find out whether the path is in an EZ and
      // capture the zone's crypto settings.
      readLock();
      try {
        INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        // Nothing to do if the path is not within an EZ
        final EncryptionZone zone = dir.getEZForPath(iip);
        if (zone != null) {
          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
          suite = zone.getSuite();
          ezKeyName = zone.getKeyName();

          Preconditions.checkNotNull(protocolVersion);
          Preconditions.checkNotNull(suite);
          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
              "Chose an UNKNOWN CipherSuite!");
          Preconditions.checkNotNull(ezKeyName);
        }
      } finally {
        readUnlock(operationName);
      }

      Preconditions.checkState(
          (suite == null && ezKeyName == null) ||
              (suite != null && ezKeyName != null),
          "Both suite and ezKeyName should both be null or not null");

      // Generate EDEK if necessary while not holding the lock
      edek = generateEncryptedDataEncryptionKey(ezKeyName);
      EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
    }

    // Proceed with the create, using the computed cipher suite and 
    // generated EDEK
    BlocksMapUpdateInfo toRemoveBlocks = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create file" + src);
      dir.writeLock();
      try {
        // Re-resolve the path: it may have changed while the lock was
        // dropped for the KeyProvider RPC above.
        final INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        toRemoveBlocks = startFileInternal(
            pc, iip, permissions, holder,
            clientMachine, create, overwrite,
            createParent, replication, blockSize,
            isLazyPersist, suite, protocolVersion, edek,
            logRetryCache);
        stat = FSDirStatAndListingOp.getFileInfo(
            dir, src, false, FSDirectory.isReservedRawName(srcArg));
      } finally {
        dir.writeUnlock();
      }
    } catch (StandbyException se) {
      // No edits were logged on a standby; skip the (expensive) logSync.
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
        if (toRemoveBlocks != null) {
          // Blocks of an overwritten file are removed outside the write lock.
          removeBlocks(toRemoveBlocks);
          toRemoveBlocks.clear();
        }
      }
    }

    logAuditEvent(true, operationName, srcArg, null, stat);
    return stat;
  }
2495
2496  /**
2497   * Create a new file or overwrite an existing file<br>
2498   * 
2499   * Once the file is create the client then allocates a new block with the next
2500   * call using {@link ClientProtocol#addBlock}.
2501   * <p>
2502   * For description of parameters and exceptions thrown see
2503   * {@link ClientProtocol#create}
2504   */
2505  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2506      INodesInPath iip, PermissionStatus permissions, String holder,
2507      String clientMachine, boolean create, boolean overwrite, 
2508      boolean createParent, short replication, long blockSize, 
2509      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2510      EncryptedKeyVersion edek, boolean logRetryEntry)
2511      throws IOException {
2512    assert hasWriteLock();
2513    // Verify that the destination does not exist as a directory already.
2514    final INode inode = iip.getLastINode();
2515    final String src = iip.getPath();
2516    if (inode != null && inode.isDirectory()) {
2517      throw new FileAlreadyExistsException(src +
2518          " already exists as a directory");
2519    }
2520
2521    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2522    if (isPermissionEnabled) {
2523      if (overwrite && myFile != null) {
2524        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2525      }
2526      /*
2527       * To overwrite existing file, need to check 'w' permission 
2528       * of parent (equals to ancestor in this case)
2529       */
2530      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
2531    }
2532    if (!createParent) {
2533      dir.verifyParentDir(iip, src);
2534    }
2535
2536    FileEncryptionInfo feInfo = null;
2537
2538    final EncryptionZone zone = dir.getEZForPath(iip);
2539    if (zone != null) {
2540      // The path is now within an EZ, but we're missing encryption parameters
2541      if (suite == null || edek == null) {
2542        throw new RetryStartFileException();
2543      }
2544      // Path is within an EZ and we have provided encryption parameters.
2545      // Make sure that the generated EDEK matches the settings of the EZ.
2546      final String ezKeyName = zone.getKeyName();
2547      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2548        throw new RetryStartFileException();
2549      }
2550      feInfo = new FileEncryptionInfo(suite, version,
2551          edek.getEncryptedKeyVersion().getMaterial(),
2552          edek.getEncryptedKeyIv(),
2553          ezKeyName, edek.getEncryptionKeyVersionName());
2554    }
2555
2556    try {
2557      BlocksMapUpdateInfo toRemoveBlocks = null;
2558      if (myFile == null) {
2559        if (!create) {
2560          throw new FileNotFoundException("Can't overwrite non-existent " +
2561              src + " for client " + clientMachine);
2562        }
2563      } else {
2564        if (overwrite) {
2565          toRemoveBlocks = new BlocksMapUpdateInfo();
2566          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2567          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
2568                                          toRemoveINodes, now());
2569          if (ret >= 0) {
2570            iip = INodesInPath.replace(iip, iip.length() - 1, null);
2571            FSDirDeleteOp.incrDeletedFileCount(ret);
2572            removeLeasesAndINodes(src, toRemoveINodes, true);
2573          }
2574        } else {
2575          // If lease soft limit time is expired, recover the lease
2576          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
2577              iip, src, holder, clientMachine, false);
2578          throw new FileAlreadyExistsException(src + " for client " +
2579              clientMachine + " already exists");
2580        }
2581      }
2582
2583      checkFsObjectLimit();
2584      INodeFile newNode = null;
2585
2586      // Always do an implicit mkdirs for parent directory tree.
2587      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
2588          .createAncestorDirectories(dir, iip, permissions);
2589      if (parent != null) {
2590        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
2591            replication, blockSize, holder, clientMachine);
2592        newNode = iip != null ? iip.getLastINode().asFile() : null;
2593      }
2594
2595      if (newNode == null) {
2596        throw new IOException("Unable to add " + src +  " to namespace");
2597      }
2598      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2599          .getClientName(), src);
2600
2601      // Set encryption attributes if necessary
2602      if (feInfo != null) {
2603        dir.setFileEncryptionInfo(src, feInfo);
2604        newNode = dir.getInode(newNode.getId()).asFile();
2605      }
2606
2607      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2608
2609      // record file record in log, record new generation stamp
2610      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2611      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" +
2612          " inode {} holder {}", src, newNode.getId(), holder);
2613      return toRemoveBlocks;
2614    } catch (IOException ie) {
2615      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2616          ie.getMessage());
2617      throw ie;
2618    }
2619  }
2620
2621  private void setNewINodeStoragePolicy(INodeFile inode,
2622                                        INodesInPath iip,
2623                                        boolean isLazyPersist)
2624      throws IOException {
2625
2626    if (isLazyPersist) {
2627      BlockStoragePolicy lpPolicy =
2628          blockManager.getStoragePolicy("LAZY_PERSIST");
2629
2630      // Set LAZY_PERSIST storage policy if the flag was passed to
2631      // CreateFile.
2632      if (lpPolicy == null) {
2633        throw new HadoopIllegalArgumentException(
2634            "The LAZY_PERSIST storage policy has been disabled " +
2635            "by the administrator.");
2636      }
2637      inode.setStoragePolicyID(lpPolicy.getId(),
2638                                 iip.getLatestSnapshotId());
2639    } else {
2640      BlockStoragePolicy effectivePolicy =
2641          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2642
2643      if (effectivePolicy != null &&
2644          effectivePolicy.isCopyOnCreateFile()) {
2645        // Copy effective policy from ancestor directory to current file.
2646        inode.setStoragePolicyID(effectivePolicy.getId(),
2647                                 iip.getLatestSnapshotId());
2648      }
2649    }
2650  }
2651
2652  /**
2653   * Append to an existing file for append.
2654   * <p>
2655   * 
2656   * The method returns the last block of the file if this is a partial block,
2657   * which can still be used for writing more data. The client uses the returned
2658   * block locations to form the data pipeline for this block.<br>
2659   * The method returns null if the last block is full. The client then
2660   * allocates a new block with the next call using
2661   * {@link ClientProtocol#addBlock}.
2662   * <p>
2663   * 
2664   * For description of parameters and exceptions thrown see
2665   * {@link ClientProtocol#append(String, String, EnumSetWritable)}
2666   *
2667   * @return the last block locations if the block is partial or null otherwise
2668   */
2669  private LocatedBlock appendFileInternal(FSPermissionChecker pc,
2670      INodesInPath iip, String holder, String clientMachine, boolean newBlock,
2671      boolean logRetryCache) throws IOException {
2672    assert hasWriteLock();
2673    // Verify that the destination does not exist as a directory already.
2674    final INode inode = iip.getLastINode();
2675    final String src = iip.getPath();
2676    if (inode != null && inode.isDirectory()) {
2677      throw new FileAlreadyExistsException("Cannot append to directory " + src
2678          + "; already exists as a directory.");
2679    }
2680    if (isPermissionEnabled) {
2681      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2682    }
2683
2684    try {
2685      if (inode == null) {
2686        throw new FileNotFoundException("failed to append to non-existent file "
2687          + src + " for client " + clientMachine);
2688      }
2689      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2690      final BlockStoragePolicy lpPolicy =
2691          blockManager.getStoragePolicy("LAZY_PERSIST");
2692      if (lpPolicy != null &&
2693          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2694        throw new UnsupportedOperationException(
2695            "Cannot append to lazy persist file " + src);
2696      }
2697      // Opening an existing file for append - may need to recover lease.
2698      recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE,
2699          iip, src, holder, clientMachine, false);
2700      
2701      final BlockInfoContiguous lastBlock = myFile.getLastBlock();
2702      // Check that the block has at least minimum replication.
2703      if(lastBlock != null && lastBlock.isComplete() &&
2704          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2705        throw new IOException("append: lastBlock=" + lastBlock +
2706            " of src=" + src + " is not sufficiently replicated yet.");
2707      }
2708      return prepareFileForAppend(src, iip, holder, clientMachine, newBlock,
2709          true, logRetryCache);
2710    } catch (IOException ie) {
2711      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2712      throw ie;
2713    }
2714  }
2715  
2716  /**
2717   * Convert current node to under construction.
2718   * Recreate in-memory lease record.
2719   * 
2720   * @param src path to the file
2721   * @param leaseHolder identifier of the lease holder on this file
2722   * @param clientMachine identifier of the client machine
2723   * @param newBlock if the data is appended to a new block
2724   * @param writeToEditLog whether to persist this change to the edit log
2725   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2726   *                      rebuilding
2727   * @return the last block locations if the block is partial or null otherwise
2728   * @throws UnresolvedLinkException
2729   * @throws IOException
2730   */
2731  LocatedBlock prepareFileForAppend(String src, INodesInPath iip,
2732      String leaseHolder, String clientMachine, boolean newBlock,
2733      boolean writeToEditLog, boolean logRetryCache) throws IOException {
2734    final INodeFile file = iip.getLastINode().asFile();
2735    final QuotaCounts delta = verifyQuotaForUCBlock(file, iip);
2736
2737    file.recordModification(iip.getLatestSnapshotId());
2738    file.toUnderConstruction(leaseHolder, clientMachine);
2739
2740    leaseManager.addLease(
2741        file.getFileUnderConstructionFeature().getClientName(), src);
2742
2743    LocatedBlock ret = null;
2744    if (!newBlock) {
2745      ret = blockManager.convertLastBlockToUnderConstruction(file, 0);
2746      if (ret != null && delta != null) {
2747        Preconditions.checkState(delta.getStorageSpace() >= 0,
2748            "appending to a block with size larger than the preferred block size");
2749        dir.writeLock();
2750        try {
2751          dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2752        } finally {
2753          dir.writeUnlock();
2754        }
2755      }
2756    } else {
2757      BlockInfoContiguous lastBlock = file.getLastBlock();
2758      if (lastBlock != null) {
2759        ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock);
2760        ret = new LocatedBlock(blk, new DatanodeInfo[0]);
2761      }
2762    }
2763
2764    if (writeToEditLog) {
2765      getEditLog().logAppendFile(src, file, newBlock, logRetryCache);
2766    }
2767    return ret;
2768  }
2769
2770  /**
2771   * Verify quota when using the preferred block size for UC block. This is
2772   * usually used by append and truncate
2773   * @throws QuotaExceededException when violating the storage quota
2774   * @return expected quota usage update. null means no change or no need to
2775   *         update quota usage later
2776   */
2777  private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2778      throws QuotaExceededException {
2779    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2780      // Do not check quota if editlog is still being processed
2781      return null;
2782    }
2783    if (file.getLastBlock() != null) {
2784      final QuotaCounts delta = computeQuotaDeltaForUCBlock(file);
2785      dir.readLock();
2786      try {
2787        FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null);
2788        return delta;
2789      } finally {
2790        dir.readUnlock();
2791      }
2792    }
2793    return null;
2794  }
2795
2796  /** Compute quota change for converting a complete block to a UC block */
2797  private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) {
2798    final QuotaCounts delta = new QuotaCounts.Builder().build();
2799    final BlockInfoContiguous lastBlock = file.getLastBlock();
2800    if (lastBlock != null) {
2801      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2802      final short repl = file.getBlockReplication();
2803      delta.addStorageSpace(diff * repl);
2804      final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite()
2805          .getPolicy(file.getStoragePolicyID());
2806      List<StorageType> types = policy.chooseStorageTypes(repl);
2807      for (StorageType t : types) {
2808        if (t.supportTypeQuota()) {
2809          delta.addTypeSpace(t, diff);
2810        }
2811      }
2812    }
2813    return delta;
2814  }
2815
2816  /**
2817   * Recover lease;
2818   * Immediately revoke the lease of the current lease holder and start lease
2819   * recovery so that the file can be forced to be closed.
2820   * 
2821   * @param src the path of the file to start lease recovery
2822   * @param holder the lease holder's name
2823   * @param clientMachine the client machine's name
2824   * @return true if the file is already closed or
2825   *         if the lease can be released and the file can be closed.
2826   * @throws IOException
2827   */
2828  boolean recoverLease(String src, String holder, String clientMachine)
2829      throws IOException {
2830    if (!DFSUtil.isValidName(src)) {
2831      throw new IOException("Invalid file name: " + src);
2832    }
2833  
2834    boolean skipSync = false;
2835    FSPermissionChecker pc = getPermissionChecker();
2836    checkOperation(OperationCategory.WRITE);
2837    writeLock();
2838    try {
2839      checkOperation(OperationCategory.WRITE);
2840      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2841      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
2842      src = iip.getPath();
2843      final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
2844      if (!inode.isUnderConstruction()) {
2845        return true;
2846      }
2847      if (isPermissionEnabled) {
2848        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2849      }
2850  
2851      return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE,
2852          iip, src, holder, clientMachine, true);
2853    } catch (StandbyException se) {
2854      skipSync = true;
2855      throw se;
2856    } finally {
2857      writeUnlock("recoverLease");
2858      // There might be transactions logged while trying to recover the lease.
2859      // They need to be sync'ed even when an exception was thrown.
2860      if (!skipSync) {
2861        getEditLog().logSync();
2862      }
2863    }
2864  }
2865
2866  private enum RecoverLeaseOp {
2867    CREATE_FILE,
2868    APPEND_FILE,
2869    TRUNCATE_FILE,
2870    RECOVER_LEASE;
2871    
2872    private String getExceptionMessage(String src, String holder,
2873        String clientMachine, String reason) {
2874      return "Failed to " + this + " " + src + " for " + holder +
2875          " on " + clientMachine + " because " + reason;
2876    }
2877  }
2878
  /**
   * Check the lease state of a file and, when appropriate, trigger lease
   * recovery or closing of the file.
   *
   * Must be called with the FSNamesystem write lock held.
   *
   * @param op the operation on whose behalf recovery is attempted (used for
   *           error messages)
   * @param iip resolved path of the file
   * @param src path of the file
   * @param holder client requesting the operation
   * @param clientMachine identifier of the client machine
   * @param force if true, revoke the current holder's lease immediately
   *              instead of waiting for the soft limit to expire
   * @return true if the file is not under construction or was successfully
   *         closed; otherwise an exception describing the lease state is
   *         thrown
   * @throws IOException if the lease cannot be recovered now
   */
  boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip,
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    if (file.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          // We found the lease for this file but the original
          // holder is trying to obtain it again.
          throw new AlreadyBeingCreatedException(
              op.getExceptionMessage(src, holder, clientMachine,
                  holder + " is already the current lease holder."));
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        // Inconsistent state: UC file with no lease record.
        throw new AlreadyBeingCreatedException(
            op.getExceptionMessage(src, holder, clientMachine,
                "the file is under construction but no leases found."));
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        return internalReleaseLease(lease, src, iip, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          if (internalReleaseLease(lease, src, iip, null)) {
            return true;
          } else {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "lease recovery is in progress. Try again later."));
          }
        } else {
          // Soft limit not yet expired: report why the file is unavailable.
          final BlockInfoContiguous lastBlock = file.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "another recovery is in progress by "
                        + clientName + " on " + uc.getClientMachine()));
          } else {
            throw new AlreadyBeingCreatedException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "this file lease is currently owned by "
                        + clientName + " on " + uc.getClientMachine()));
          }
        }
      }
    } else {
      // File is complete; no lease to recover.
      return true;
     }
  }
2956
2957  /**
2958   * Append to an existing file in the namespace.
2959   */
2960  LastBlockWithStatus appendFile(String src, String holder,
2961      String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache)
2962      throws IOException {
2963    try {
2964      return appendFileInt(src, holder, clientMachine,
2965          flag.contains(CreateFlag.NEW_BLOCK), logRetryCache);
2966    } catch (AccessControlException e) {
2967      logAuditEvent(false, "append", src);
2968      throw e;
2969    }
2970  }
2971
  /**
   * Worker for {@link #appendFile}: performs the actual append under the
   * namesystem write lock.
   *
   * @param srcArg path of the file to append to
   * @param holder lease holder (client name)
   * @param clientMachine identifier of the client machine
   * @param newBlock if true, force the append to start on a new block
   * @param logRetryCache whether to record RPC ids in the edit log for
   *                      retry-cache rebuilding
   * @return the last (partial) block locations plus the file status
   * @throws IOException if append is disabled or the file cannot be opened
   */
  private LastBlockWithStatus appendFileInt(final String srcArg, String holder,
      String clientMachine, boolean newBlock, boolean logRetryCache)
      throws IOException {
    String src = srcArg;
    final String operationName = "append";
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}",
        src, holder, clientMachine);
    boolean skipSync = false;
    if (!supportAppends) {
      throw new UnsupportedOperationException(
          "Append is not enabled on this NameNode. Use the " +
          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
    }

    LocatedBlock lb = null;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot append to file" + src);
      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
      src = iip.getPath();
      lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock,
          logRetryCache);
      stat = FSDirStatAndListingOp.getFileInfo(dir, src, false,
          FSDirectory.isReservedRawName(srcArg));
    } catch (StandbyException se) {
      // No edits were logged on a standby; skip the logSync below.
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
      }
    }
    if (lb != null) {
      NameNode.stateChangeLog.debug(
          "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" +
          " size {}", src, holder, clientMachine, lb.getBlock(),
          lb.getBlock().getNumBytes());
    }
    logAuditEvent(true, operationName, srcArg);
    return new LastBlockWithStatus(lb, stat);
  }
3020
3021  ExtendedBlock getExtendedBlock(Block blk) {
3022    return new ExtendedBlock(blockPoolId, blk);
3023  }
3024  
3025  void setBlockPoolId(String bpid) {
3026    blockPoolId = bpid;
3027    blockManager.setBlockPoolId(blockPoolId);
3028  }
3029
3030  /**
3031   * The client would like to obtain an additional block for the indicated
3032   * filename (which is being written-to).  Return an array that consists
3033   * of the block, plus a set of machines.  The first on this list should
3034   * be where the client writes data.  Subsequent items in the list must
3035   * be provided in the connection to the first datanode.
3036   *
3037   * Make sure the previous blocks have been reported by datanodes and
3038   * are replicated.  Will return an empty 2-elt array if we want the
3039   * client to "try again later".
3040   */
3041  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
3042      ExtendedBlock previous, Set<Node> excludedNodes, 
3043      List<String> favoredNodes) throws IOException {
3044    LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3045    DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId,
3046        clientName, previous, excludedNodes, favoredNodes, onRetryBlock);
3047    if (targets == null) {
3048      assert onRetryBlock[0] != null : "Retry block is null";
3049      // This is a retry. Just return the last block.
3050      return onRetryBlock[0];
3051    }
3052    LocatedBlock newBlock = storeAllocatedBlock(
3053        src, fileId, clientName, previous, targets);
3054    return newBlock;
3055  }
3056
3057  /**
3058   * Part I of getAdditionalBlock().
3059   * Analyze the state of the file under read lock to determine if the client
3060   * can add a new block, detect potential retries, lease mismatches,
3061   * and minimal replication of the penultimate block.
3062   * 
3063   * Generate target DataNode locations for the new block,
3064   * but do not create the new block yet.
3065   */
3066  DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId,
3067      String clientName, ExtendedBlock previous, Set<Node> excludedNodes,
3068      List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException {
3069    final long blockSize;
3070    final int replication;
3071    final byte storagePolicyID;
3072    Node clientNode = null;
3073    String clientMachine = null;
3074
3075    NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {}  inodeId {}" +
3076        " for {}", src, fileId, clientName);
3077
3078    checkOperation(OperationCategory.READ);
3079    FSPermissionChecker pc = getPermissionChecker();
3080    readLock();
3081    try {
3082      checkOperation(OperationCategory.READ);
3083      INodesInPath iip = dir.resolvePath(pc, src, fileId);
3084      src = iip.getPath();
3085      FileState fileState = analyzeFileState(
3086          iip, fileId, clientName, previous, onRetryBlock);
3087      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
3088        // This is a retry. No need to generate new locations.
3089        // Use the last block if it has locations.
3090        return null;
3091      }
3092
3093      final INodeFile pendingFile = fileState.inode;
3094      if (!checkFileProgress(src, pendingFile, false)) {
3095        throw new NotReplicatedYetException("Not replicated yet: " + src);
3096      }
3097      src = fileState.path;
3098
3099      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
3100        throw new IOException("File has reached the limit on maximum number of"
3101            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
3102            + "): " + pendingFile.getBlocks().length + " >= "
3103            + maxBlocksPerFile);
3104      }
3105      blockSize = pendingFile.getPreferredBlockSize();
3106      clientMachine = pendingFile.getFileUnderConstructionFeature()
3107          .getClientMachine();
3108      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
3109          clientMachine);
3110      replication = pendingFile.getFileReplication();
3111      storagePolicyID = pendingFile.getStoragePolicyID();
3112    } finally {
3113      readUnlock("getNewBlockTargets");
3114    }
3115
3116    if (clientNode == null) {
3117      clientNode = getClientNode(clientMachine);
3118    }
3119
3120    // choose targets for the new block to be allocated.
3121    return getBlockManager().chooseTarget4NewBlock( 
3122        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
3123        storagePolicyID);
3124  }
3125
3126  /**
3127   * Part II of getAdditionalBlock().
3128   * Should repeat the same analysis of the file state as in Part 1,
3129   * but under the write lock.
3130   * If the conditions still hold, then allocate a new block with
3131   * the new targets, add it to the INode and to the BlocksMap.
3132   */
3133  LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName,
3134      ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException {
3135    Block newBlock = null;
3136    long offset;
3137    checkOperation(OperationCategory.WRITE);
3138    waitForLoadingFSImage();
3139    writeLock();
3140    try {
3141      checkOperation(OperationCategory.WRITE);
3142      // Run the full analysis again, since things could have changed
3143      // while chooseTarget() was executing.
3144      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3145      final INodesInPath iip = dir.resolvePath(null, src, fileId);
3146      FileState fileState = 
3147          analyzeFileState(iip, fileId, clientName, previous, onRetryBlock);
3148      final INodeFile pendingFile = fileState.inode;
3149      src = fileState.path;
3150
3151      if (onRetryBlock[0] != null) {
3152        if (onRetryBlock[0].getLocations().length > 0) {
3153          // This is a retry. Just return the last block if having locations.
3154          return onRetryBlock[0];
3155        } else {
3156          // add new chosen targets to already allocated block and return
3157          BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
3158          ((BlockInfoContiguousUnderConstruction) lastBlockInFile)
3159              .setExpectedLocations(targets);
3160          offset = pendingFile.computeFileSize();
3161          return makeLocatedBlock(lastBlockInFile, targets, offset);
3162        }
3163      }
3164
3165      // commit the last block and complete it if it has minimum replicas
3166      commitOrCompleteLastBlock(pendingFile, fileState.iip,
3167                                ExtendedBlock.getLocalBlock(previous));
3168
3169      // allocate new block, record block locations in INode.
3170      newBlock = createNewBlock();
3171      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
3172      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
3173
3174      persistNewBlock(src, pendingFile);
3175      offset = pendingFile.computeFileSize();
3176    } finally {
3177      writeUnlock("storeAllocatedBlock");
3178    }
3179    getEditLog().logSync();
3180
3181    // Return located block
3182    return makeLocatedBlock(newBlock, targets, offset);
3183  }
3184
3185  /*
3186   * Resolve clientmachine address to get a network location path
3187   */
3188  private Node getClientNode(String clientMachine) {
3189    List<String> hosts = new ArrayList<String>(1);
3190    hosts.add(clientMachine);
3191    List<String> rName = getBlockManager().getDatanodeManager()
3192        .resolveNetworkLocation(hosts);
3193    Node clientNode = null;
3194    if (rName != null) {
3195      // Able to resolve clientMachine mapping.
3196      // Create a temp node to findout the rack local nodes
3197      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3198          + clientMachine);
3199    }
3200    return clientNode;
3201  }
3202
  /**
   * Immutable snapshot of an under-construction file's resolved state,
   * as produced by analyzeFileState().
   */
  static class FileState {
    // The inode of the file being written.
    public final INodeFile inode;
    // The file's resolved full path.
    public final String path;
    // The resolved components of the path; last INode is the file itself.
    public final INodesInPath iip;

    public FileState(INodeFile inode, String fullPath, INodesInPath iip) {
      this.inode = inode;
      this.path = fullPath;
      this.iip = iip;
    }
  }
3214
  /**
   * Validate that a new block may be added to the file and detect client
   * retries of a prior getAdditionalBlock() call by comparing the block the
   * client reports as last against the namesystem's view (the four cases
   * are enumerated in the comment below).
   * Must be called with at least the read lock held.
   *
   * @param iip resolved path of the file under construction
   * @param fileId inode id reported by the client
   * @param clientName lease holder
   * @param previous the block the client believes is currently last
   * @param onRetryBlock out-parameter: set to the existing last block when
   *        the call is recognized as a retry (case 2)
   * @return the file's inode, path and resolved components
   * @throws IOException on safe mode, fs-object quota, lease mismatch, or a
   *         bogus 'previous' block (case 3)
   */
  private FileState analyzeFileState(
      INodesInPath iip, long fileId, String clientName,
      ExtendedBlock previous, LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();
    String src = iip.getPath();
    checkBlock(previous);
    // Reset the out-parameter; only set when a retry is recognized below.
    onRetryBlock[0] = null;
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodeFile pendingFile = checkLease(iip, clientName, fileId);
    BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.allocateBlock: handling block allocation" +
            " writing to a file with a complete previous block: src={}" +
            " lastBlock={}", src, lastBlockInFile);
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        // A retried allocation's block must still be empty; bytes in it
        // mean the client already wrote to it and this is not a safe retry.
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src, iip);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }
    return new FileState(pendingFile, src, iip);
  }
3295
3296  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3297                                        long offset) throws IOException {
3298    LocatedBlock lBlk = new LocatedBlock(
3299        getExtendedBlock(blk), locs, offset, false);
3300    getBlockManager().setBlockToken(
3301        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3302    return lBlk;
3303  }
3304
3305  /** @see ClientProtocol#getAdditionalDatanode */
3306  LocatedBlock getAdditionalDatanode(String src, long fileId,
3307      final ExtendedBlock blk, final DatanodeInfo[] existings,
3308      final String[] storageIDs,
3309      final Set<Node> excludes,
3310      final int numAdditionalNodes, final String clientName
3311      ) throws IOException {
3312    //check if the feature is enabled
3313    dtpReplaceDatanodeOnFailure.checkEnabled();
3314
3315    Node clientnode = null;
3316    String clientMachine;
3317    final long preferredblocksize;
3318    final byte storagePolicyID;
3319    final List<DatanodeStorageInfo> chosen;
3320    checkOperation(OperationCategory.READ);
3321    FSPermissionChecker pc = getPermissionChecker();
3322    readLock();
3323    try {
3324      checkOperation(OperationCategory.READ);
3325      //check safe mode
3326      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
3327      final INodesInPath iip = dir.resolvePath(pc, src, fileId);
3328      src = iip.getPath();
3329
3330      //check lease
3331      final INodeFile file = checkLease(iip, clientName, fileId);
3332      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
3333      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
3334      preferredblocksize = file.getPreferredBlockSize();
3335      storagePolicyID = file.getStoragePolicyID();
3336
3337      //find datanode storages
3338      final DatanodeManager dm = blockManager.getDatanodeManager();
3339      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs,
3340          "src=%s, fileId=%d, blk=%s, clientName=%s, clientMachine=%s",
3341          src, fileId, blk, clientName, clientMachine));
3342    } finally {
3343      readUnlock("getAdditionalDatanode");
3344    }
3345
3346    if (clientnode == null) {
3347      clientnode = getClientNode(clientMachine);
3348    }
3349
3350    // choose new datanodes.
3351    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
3352        src, numAdditionalNodes, clientnode, chosen, 
3353        excludes, preferredblocksize, storagePolicyID);
3354    final LocatedBlock lb = new LocatedBlock(blk, targets);
3355    blockManager.setBlockToken(lb, AccessMode.COPY);
3356    return lb;
3357  }
3358
3359  /**
3360   * The client would like to let go of the given block
3361   */
3362  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3363      throws IOException {
3364    NameNode.stateChangeLog.debug(
3365        "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src);
3366    checkOperation(OperationCategory.WRITE);
3367    FSPermissionChecker pc = getPermissionChecker();
3368    waitForLoadingFSImage();
3369    writeLock();
3370    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
3371    src = iip.getPath();
3372    try {
3373      checkOperation(OperationCategory.WRITE);
3374      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3375      final INodeFile file = checkLease(iip, holder, fileId);
3376
3377      // Remove the block from the pending creates list
3378      boolean removed = dir.removeBlock(src, iip, file,
3379          ExtendedBlock.getLocalBlock(b));
3380      if (!removed) {
3381        return true;
3382      }
3383      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " +
3384          "removed from pendingCreates", b);
3385      persistBlocks(src, file, false);
3386    } finally {
3387      writeUnlock("abandonBlock");
3388    }
3389    getEditLog().logSync();
3390
3391    return true;
3392  }
3393
3394  private INodeFile checkLease(INodesInPath iip, String holder, long fileId)
3395      throws LeaseExpiredException, FileNotFoundException {
3396    String src = iip.getPath();
3397    INode inode = iip.getLastINode();
3398    assert hasReadLock();
3399    final String ident = src + " (inode " + fileId + ")";
3400    if (inode == null) {
3401      Lease lease = leaseManager.getLease(holder);
3402      throw new LeaseExpiredException(
3403          "No lease on " + ident + ": File does not exist. "
3404          + (lease != null ? lease.toString()
3405              : "Holder " + holder + " does not have any open files."));
3406    }
3407    if (!inode.isFile()) {
3408      Lease lease = leaseManager.getLease(holder);
3409      throw new LeaseExpiredException(
3410          "No lease on " + ident + ": INode is not a regular file. "
3411              + (lease != null ? lease.toString()
3412              : "Holder " + holder + " does not have any open files."));
3413    }
3414    final INodeFile file = inode.asFile();
3415    if (!file.isUnderConstruction()) {
3416      Lease lease = leaseManager.getLease(holder);
3417      throw new LeaseExpiredException(
3418          "No lease on " + ident + ": File is not open for writing. "
3419          + (lease != null ? lease.toString()
3420              : "Holder " + holder + " does not have any open files."));
3421    }
3422    // No further modification is allowed on a deleted file.
3423    // A file is considered deleted, if it is not in the inodeMap or is marked
3424    // as deleted in the snapshot feature.
3425    if (isFileDeleted(file)) {
3426      throw new FileNotFoundException(src);
3427    }
3428    String clientName = file.getFileUnderConstructionFeature().getClientName();
3429    if (holder != null && !clientName.equals(holder)) {
3430      throw new LeaseExpiredException("Lease mismatch on " + ident +
3431          " owned by " + clientName + " but is accessed by " + holder);
3432    }
3433    return file;
3434  }
3435 
3436  /**
3437   * Complete in-progress write to the given file.
3438   * @return true if successful, false if the client should continue to retry
3439   *         (e.g if not all blocks have reached minimum replication yet)
3440   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3441   */
3442  boolean completeFile(final String srcArg, String holder,
3443                       ExtendedBlock last, long fileId)
3444    throws SafeModeException, UnresolvedLinkException, IOException {
3445    String src = srcArg;
3446    NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}",
3447        src, holder);
3448    checkBlock(last);
3449    boolean success = false;
3450    checkOperation(OperationCategory.WRITE);
3451    waitForLoadingFSImage();
3452    writeLock();
3453    try {
3454      checkOperation(OperationCategory.WRITE);
3455      checkNameNodeSafeMode("Cannot complete file " + src);
3456      success = completeFileInternal(src, holder,
3457        ExtendedBlock.getLocalBlock(last), fileId);
3458    } finally {
3459      writeUnlock("completeFile");
3460    }
3461    getEditLog().logSync();
3462    if (success) {
3463      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3464          + " is closed by " + holder);
3465    }
3466    return success;
3467  }
3468
  /**
   * Do the real work of completeFile() under the write lock: verify the
   * lease, tolerate retried close RPCs (HDFS-3031), commit the last block
   * and finalize the file once all blocks meet minimum replication.
   *
   * @param src path of the file being closed
   * @param holder lease holder (client name)
   * @param last the block the client believes is the file's last block
   * @param fileId inode id of the file
   * @return true if the file was (or already had been) closed; false if the
   *         client should retry later because blocks are not yet replicated
   * @throws IOException on lease mismatch or other errors
   */
  private boolean completeFileInternal(String src, String holder, Block last,
      long fileId) throws IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    FSPermissionChecker pc = getPermissionChecker();
    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
    src = iip.getPath();
    INode inode = null;
    try {
      inode = iip.getLastINode();
      pendingFile = checkLease(iip, holder, fileId);
    } catch (LeaseExpiredException lee) {
      if (inode != null && inode.isFile() &&
          !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(src, pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, last);

    // Re-check with checkall=true: every block must now be complete.
    if (!checkFileProgress(src, pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3517
3518  /**
3519   * Save allocated block at the given pending filename
3520   * 
3521   * @param src path to the file
3522   * @param inodesInPath representing each of the components of src.
3523   *                     The last INode is the INode for {@code src} file.
3524   * @param newBlock newly allocated block to be save
3525   * @param targets target datanodes where replicas of the new block is placed
3526   * @throws QuotaExceededException If addition of block exceeds space quota
3527   */
3528  BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath,
3529      Block newBlock, DatanodeStorageInfo[] targets)
3530          throws IOException {
3531    assert hasWriteLock();
3532    BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets);
3533    NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src);
3534    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3535    return b;
3536  }
3537
3538  /**
3539   * Create new block with a unique block id and a new generation stamp.
3540   */
3541  Block createNewBlock() throws IOException {
3542    assert hasWriteLock();
3543    Block b = new Block(nextBlockId(), 0, 0);
3544    // Increment the generation stamp for every new block.
3545    b.setGenerationStamp(nextGenerationStamp(false));
3546    return b;
3547  }
3548
3549  /**
3550   * Check that the indicated file's blocks are present and
3551   * replicated.  If not, return false. If checkall is true, then check
3552   * all blocks, otherwise check only penultimate block.
3553   */
3554  boolean checkFileProgress(String src, INodeFile v, boolean checkall) {
3555    if (checkall) {
3556      // check all blocks of the file.
3557      for (BlockInfoContiguous block: v.getBlocks()) {
3558        if (!isCompleteBlock(src, block, blockManager.minReplication)) {
3559          return false;
3560        }
3561      }
3562    } else {
3563      // check the penultimate block of this file
3564      BlockInfoContiguous b = v.getPenultimateBlock();
3565      if (b != null
3566          && !isCompleteBlock(src, b, blockManager.minReplication)) {
3567        return false;
3568      }
3569    }
3570    return true;
3571  }
3572
3573  private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) {
3574    if (!b.isComplete()) {
3575      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b;
3576      final int numNodes = b.numNodes();
3577      LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = "
3578          + uc.getBlockUCState() + ", replication# = " + numNodes
3579          + (numNodes < minRepl? " < ": " >= ")
3580          + " minimum = " + minRepl + ") in file " + src);
3581      return false;
3582    }
3583    return true;
3584  }
3585
3586  ////////////////////////////////////////////////////////////////
3587  // Here's how to handle block-copy failure during client write:
3588  // -- As usual, the client's write should result in a streaming
3589  // backup write to a k-machine sequence.
3590  // -- If one of the backup machines fails, no worries.  Fail silently.
3591  // -- Before client is allowed to close and finalize file, make sure
3592  // that the blocks are backed up.  Namenode may have to issue specific backup
3593  // commands to make up for earlier datanode failures.  Once all copies
3594  // are made, edit namespace and return to client.
3595  ////////////////////////////////////////////////////////////////
3596
3597  /** 
3598   * Change the indicated filename. 
3599   * @deprecated Use {@link #renameTo(String, String, boolean,
3600   * Options.Rename...)} instead.
3601   */
3602  @Deprecated
3603  boolean renameTo(String src, String dst, boolean logRetryCache)
3604      throws IOException {
3605    final String operationName = "rename";
3606    waitForLoadingFSImage();
3607    FSDirRenameOp.RenameOldResult ret = null;
3608    writeLock();
3609    try {
3610      checkOperation(OperationCategory.WRITE);
3611      checkNameNodeSafeMode("Cannot rename " + src);
3612      ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
3613    } catch (AccessControlException e)  {
3614      logAuditEvent(false, operationName, src, dst, null);
3615      throw e;
3616    } finally {
3617      writeUnlock(operationName);
3618    }
3619    boolean success = ret != null && ret.success;
3620    if (success) {
3621      getEditLog().logSync();
3622    }
3623    logAuditEvent(success, "rename", src, dst,
3624        ret == null ? null : ret.auditStat);
3625    return success;
3626  }
3627
3628  void renameTo(final String src, final String dst,
3629                boolean logRetryCache, Options.Rename... options)
3630      throws IOException {
3631    final String operationName = "rename";
3632    waitForLoadingFSImage();
3633    Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
3634    writeLock();
3635    try {
3636      checkOperation(OperationCategory.WRITE);
3637      checkNameNodeSafeMode("Cannot rename " + src);
3638      res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
3639    } catch (AccessControlException e) {
3640      logAuditEvent(false, operationName + " (options=" +
3641          Arrays.toString(options) + ")", src, dst, null);
3642      throw e;
3643    } finally {
3644      writeUnlock(operationName);
3645    }
3646
3647    getEditLog().logSync();
3648
3649    BlocksMapUpdateInfo collectedBlocks = res.getKey();
3650    HdfsFileStatus auditStat = res.getValue();
3651    if (!collectedBlocks.getToDeleteList().isEmpty()) {
3652      removeBlocks(collectedBlocks);
3653      collectedBlocks.clear();
3654    }
3655
3656    logAuditEvent(true, operationName + " (options=" +
3657        Arrays.toString(options) + ")", src, dst, auditStat);
3658  }
3659
3660  /**
3661   * Remove the indicated file from namespace.
3662   * 
3663   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3664   * description of exceptions
3665   */
3666  boolean delete(String src, boolean recursive, boolean logRetryCache)
3667      throws IOException {
3668    waitForLoadingFSImage();
3669    final String operationName = "delete";
3670    BlocksMapUpdateInfo toRemovedBlocks = null;
3671    writeLock();
3672    boolean ret = false;
3673    try {
3674      checkOperation(OperationCategory.WRITE);
3675      checkNameNodeSafeMode("Cannot delete " + src);
3676      toRemovedBlocks = FSDirDeleteOp.delete(
3677          this, src, recursive, logRetryCache);
3678      ret = toRemovedBlocks != null;
3679    } catch (AccessControlException e) {
3680      logAuditEvent(false, operationName, src);
3681      throw e;
3682    } finally {
3683      writeUnlock(operationName);
3684    }
3685    getEditLog().logSync();
3686    if (toRemovedBlocks != null) {
3687      removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
3688    }
3689    logAuditEvent(true, operationName, src);
3690    return ret;
3691  }
3692
3693  FSPermissionChecker getPermissionChecker()
3694      throws AccessControlException {
3695    return dir.getPermissionChecker();
3696  }
3697
3698  /**
3699   * From the given list, incrementally remove the blocks from blockManager
3700   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3701   * ensure that other waiters on the lock can get in. See HDFS-2938
3702   * 
3703   * @param blocks
3704   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3705   *          of blocks that need to be removed from blocksMap
3706   */
3707  void removeBlocks(BlocksMapUpdateInfo blocks) {
3708    List<Block> toDeleteList = blocks.getToDeleteList();
3709    Iterator<Block> iter = toDeleteList.iterator();
3710    while (iter.hasNext()) {
3711      writeLock();
3712      try {
3713        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3714          blockManager.removeBlock(iter.next());
3715        }
3716      } finally {
3717        writeUnlock("removeBlocks");
3718      }
3719    }
3720  }
3721  
3722  /**
3723   * Remove leases and inodes related to a given path
3724   * @param src The given path
3725   * @param removedINodes Containing the list of inodes to be removed from
3726   *                      inodesMap
3727   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
3728   */
3729  void removeLeasesAndINodes(String src, List<INode> removedINodes,
3730      final boolean acquireINodeMapLock) {
3731    assert hasWriteLock();
3732    leaseManager.removeLeaseWithPrefixPath(src);
3733    // remove inodes from inodesMap
3734    if (removedINodes != null) {
3735      if (acquireINodeMapLock) {
3736        dir.writeLock();
3737      }
3738      try {
3739        dir.removeFromInodeMap(removedINodes);
3740      } finally {
3741        if (acquireINodeMapLock) {
3742          dir.writeUnlock();
3743        }
3744      }
3745      removedINodes.clear();
3746    }
3747  }
3748
3749  /**
3750   * Removes the blocks from blocksmap and updates the safemode blocks total
3751   * 
3752   * @param blocks
3753   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3754   *          of blocks that need to be removed from blocksMap
3755   */
3756  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3757    assert hasWriteLock();
3758    // In the case that we are a Standby tailing edits from the
3759    // active while in safe-mode, we need to track the total number
3760    // of blocks and safe blocks in the system.
3761    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3762    int numRemovedComplete = 0, numRemovedSafe = 0;
3763
3764    for (Block b : blocks.getToDeleteList()) {
3765      if (trackBlockCounts) {
3766        BlockInfoContiguous bi = getStoredBlock(b);
3767        if (bi.isComplete()) {
3768          numRemovedComplete++;
3769          if (bi.numNodes() >= blockManager.minReplication) {
3770            numRemovedSafe++;
3771          }
3772        }
3773      }
3774      blockManager.removeBlock(b);
3775    }
3776    if (trackBlockCounts) {
3777      if (LOG.isDebugEnabled()) {
3778        LOG.debug("Adjusting safe-mode totals for deletion."
3779            + "decreasing safeBlocks by " + numRemovedSafe
3780            + ", totalBlocks by " + numRemovedComplete);
3781      }
3782      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3783    }
3784  }
3785
3786  /**
3787   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3788   */
3789  private boolean isSafeModeTrackingBlocks() {
3790    if (!haEnabled) {
3791      // Never track blocks incrementally in non-HA code.
3792      return false;
3793    }
3794    SafeModeInfo sm = this.safeMode;
3795    return sm != null && sm.shouldIncrementallyTrackBlocks();
3796  }
3797
3798  /**
3799   * Get the file info for a specific file.
3800   *
3801   * @param src The string representation of the path to the file
3802   * @param resolveLink whether to throw UnresolvedLinkException
3803   *        if src refers to a symlink
3804   *
3805   * @throws AccessControlException if access is denied
3806   * @throws UnresolvedLinkException if a symlink is encountered.
3807   *
3808   * @return object containing information regarding the file
3809   *         or null if file not found
3810   * @throws StandbyException
3811   */
3812  HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
3813    throws IOException {
3814    final String operationName = "getfileinfo";
3815    checkOperation(OperationCategory.READ);
3816    HdfsFileStatus stat = null;
3817    readLock();
3818    try {
3819      checkOperation(OperationCategory.READ);
3820      stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
3821    } catch (AccessControlException e) {
3822      logAuditEvent(false, operationName, src);
3823      throw e;
3824    } finally {
3825      readUnlock(operationName);
3826    }
3827    logAuditEvent(true, operationName, src);
3828    return stat;
3829  }
3830
3831  /**
3832   * Returns true if the file is closed
3833   */
3834  boolean isFileClosed(final String src) throws IOException {
3835    final String operationName = "isFileClosed";
3836    checkOperation(OperationCategory.READ);
3837    readLock();
3838    try {
3839      checkOperation(OperationCategory.READ);
3840      return FSDirStatAndListingOp.isFileClosed(dir, src);
3841    } catch (AccessControlException e) {
3842      logAuditEvent(false, operationName, src);
3843      throw e;
3844    } finally {
3845      readUnlock(operationName);
3846    }
3847  }
3848
3849  /**
3850   * Create all the necessary directories
3851   */
3852  boolean mkdirs(String src, PermissionStatus permissions,
3853      boolean createParent) throws IOException {
3854    final String operationName = "mkdirs";
3855    HdfsFileStatus auditStat = null;
3856    checkOperation(OperationCategory.WRITE);
3857    writeLock();
3858    try {
3859      checkOperation(OperationCategory.WRITE);
3860      checkNameNodeSafeMode("Cannot create directory " + src);
3861      auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
3862    } catch (AccessControlException e) {
3863      logAuditEvent(false, operationName, src);
3864      throw e;
3865    } finally {
3866      writeUnlock(operationName);
3867    }
3868    getEditLog().logSync();
3869    logAuditEvent(true, operationName, src, null, auditStat);
3870    return true;
3871  }
3872
3873  /**
3874   * Get the content summary for a specific file/dir.
3875   *
3876   * @param src The string representation of the path to the file
3877   *
3878   * @throws AccessControlException if access is denied
3879   * @throws UnresolvedLinkException if a symlink is encountered.
3880   * @throws FileNotFoundException if no file exists
3881   * @throws StandbyException
3882   * @throws IOException for issues with writing to the audit log
3883   *
3884   * @return object containing information regarding the file
3885   *         or null if file not found
3886   */
3887  ContentSummary getContentSummary(final String src) throws IOException {
3888    checkOperation(OperationCategory.READ);
3889    final String operationName = "contentSummary";
3890    readLock();
3891    boolean success = true;
3892    try {
3893      checkOperation(OperationCategory.READ);
3894      return FSDirStatAndListingOp.getContentSummary(dir, src);
3895    } catch (AccessControlException ace) {
3896      success = false;
3897      throw ace;
3898    } finally {
3899      readUnlock(operationName);
3900      logAuditEvent(success, operationName, src);
3901    }
3902  }
3903
3904  /**
3905   * Set the namespace quota and storage space quota for a directory.
3906   * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
3907   * contract.
3908   * 
3909   * Note: This does not support ".inodes" relative path.
3910   */
3911  void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
3912      throws IOException {
3913    checkOperation(OperationCategory.WRITE);
3914    final String operationName = "setQuota";
3915    writeLock();
3916    boolean success = false;
3917    try {
3918      checkOperation(OperationCategory.WRITE);
3919      checkNameNodeSafeMode("Cannot set quota on " + src);
3920      FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
3921      success = true;
3922    } finally {
3923      writeUnlock(operationName);
3924      if (success) {
3925        getEditLog().logSync();
3926      }
3927      logAuditEvent(success, operationName, src);
3928    }
3929  }
3930
  /** Persist all metadata about this file.
   * @param src The string representation of the path
   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
   *               INodeId.GRANDFATHER_INODE_ID here.
   * @param clientName The string representation of the client
   * @param lastBlockLength The length of the last block 
   *                        under construction reported from client.
   * @throws IOException if path does not exist
   */
  void fsync(String src, long fileId, String clientName, long lastBlockLength)
      throws IOException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    checkOperation(OperationCategory.WRITE);

    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      // Resolve the path and verify that the caller still holds the lease.
      INodesInPath iip = dir.resolvePath(pc, src, fileId);
      src = iip.getPath();
      final INodeFile pendingFile = checkLease(iip, clientName, fileId);
      if (lastBlockLength > 0) {
        // Record the client-reported length on the under-construction
        // last block.
        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
            pendingFile, lastBlockLength);
      }
      // Log the current block list so it survives a NameNode restart.
      persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock("fsync");
    }
    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
  }
3964
3965  /**
3966   * Move a file that is being written to be immutable.
3967   * @param src The filename
3968   * @param lease The lease for the client creating the file
3969   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3970   *        needs recovery; keep current holder if null.
3971   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3972   *         replication;<br>
3973   *         RecoveryInProgressException if lease recovery is in progress.<br>
3974   *         IOException in case of an error.
3975   * @return true  if file has been successfully finalized and closed or 
3976   *         false if block recovery has been initiated. Since the lease owner
3977   *         has been changed and logged, caller should call logSync().
3978   */
3979  boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
3980      String recoveryLeaseHolder) throws IOException {
3981    LOG.info("Recovering " + lease + ", src=" + src);
3982    assert !isInSafeMode();
3983    assert hasWriteLock();
3984
3985    final INodeFile pendingFile = iip.getLastINode().asFile();
3986    int nrBlocks = pendingFile.numBlocks();
3987    BlockInfoContiguous[] blocks = pendingFile.getBlocks();
3988
3989    int nrCompleteBlocks;
3990    BlockInfoContiguous curBlock = null;
3991    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
3992      curBlock = blocks[nrCompleteBlocks];
3993      if(!curBlock.isComplete())
3994        break;
3995      assert blockManager.checkMinReplication(curBlock) :
3996              "A COMPLETE block is not minimally replicated in " + src;
3997    }
3998
3999    // If there are no incomplete blocks associated with this file,
4000    // then reap lease immediately and close the file.
4001    if(nrCompleteBlocks == nrBlocks) {
4002      finalizeINodeFileUnderConstruction(src, pendingFile,
4003          iip.getLatestSnapshotId());
4004      NameNode.stateChangeLog.warn("BLOCK*"
4005        + " internalReleaseLease: All existing blocks are COMPLETE,"
4006        + " lease removed, file closed.");
4007      return true;  // closed!
4008    }
4009
4010    // Only the last and the penultimate blocks may be in non COMPLETE state.
4011    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
4012    if(nrCompleteBlocks < nrBlocks - 2 ||
4013       nrCompleteBlocks == nrBlocks - 2 &&
4014         curBlock != null &&
4015         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
4016      final String message = "DIR* NameSystem.internalReleaseLease: "
4017        + "attempt to release a create lock on "
4018        + src + " but file is already closed.";
4019      NameNode.stateChangeLog.warn(message);
4020      throw new IOException(message);
4021    }
4022
4023    // The last block is not COMPLETE, and
4024    // that the penultimate block if exists is either COMPLETE or COMMITTED
4025    final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
4026    BlockUCState lastBlockState = lastBlock.getBlockUCState();
4027    BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
4028
4029    // If penultimate block doesn't exist then its minReplication is met
4030    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
4031        blockManager.checkMinReplication(penultimateBlock);
4032
4033    switch(lastBlockState) {
4034    case COMPLETE:
4035      assert false : "Already checked that the last block is incomplete";
4036      break;
4037    case COMMITTED:
4038      // Close file if committed blocks are minimally replicated
4039      if(penultimateBlockMinReplication &&
4040          blockManager.checkMinReplication(lastBlock)) {
4041        finalizeINodeFileUnderConstruction(src, pendingFile,
4042            iip.getLatestSnapshotId());
4043        NameNode.stateChangeLog.warn("BLOCK*"
4044          + " internalReleaseLease: Committed blocks are minimally replicated,"
4045          + " lease removed, file closed.");
4046        return true;  // closed!
4047      }
4048      // Cannot close file right now, since some blocks 
4049      // are not yet minimally replicated.
4050      // This may potentially cause infinite loop in lease recovery
4051      // if there are no valid replicas on data-nodes.
4052      String message = "DIR* NameSystem.internalReleaseLease: " +
4053          "Failed to release lease for file " + src +
4054          ". Committed blocks are waiting to be minimally replicated." +
4055          " Try again later.";
4056      NameNode.stateChangeLog.warn(message);
4057      throw new AlreadyBeingCreatedException(message);
4058    case UNDER_CONSTRUCTION:
4059    case UNDER_RECOVERY:
4060      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock;
4061      // determine if last block was intended to be truncated
4062      Block recoveryBlock = uc.getTruncateBlock();
4063      boolean truncateRecovery = recoveryBlock != null;
4064      boolean copyOnTruncate = truncateRecovery &&
4065          recoveryBlock.getBlockId() != uc.getBlockId();
4066      assert !copyOnTruncate ||
4067          recoveryBlock.getBlockId() < uc.getBlockId() &&
4068          recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
4069          recoveryBlock.getNumBytes() > uc.getNumBytes() :
4070            "wrong recoveryBlock";
4071
4072      // setup the last block locations from the blockManager if not known
4073      if (uc.getNumExpectedLocations() == 0) {
4074        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
4075      }
4076
4077      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
4078        // There is no datanode reported to this block.
4079        // may be client have crashed before writing data to pipeline.
4080        // This blocks doesn't need any recovery.
4081        // We can remove this block and close the file.
4082        pendingFile.removeLastBlock(lastBlock);
4083        finalizeINodeFileUnderConstruction(src, pendingFile,
4084            iip.getLatestSnapshotId());
4085        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
4086            + "Removed empty last block and closed file.");
4087        return true;
4088      }
4089      // start recovery of the last block for this file
4090      long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
4091      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
4092      if(copyOnTruncate) {
4093        uc.setGenerationStamp(blockRecoveryId);
4094      } else if(truncateRecovery) {
4095        recoveryBlock.setGenerationStamp(blockRecoveryId);
4096      }
4097      uc.initializeBlockRecovery(blockRecoveryId);
4098      leaseManager.renewLease(lease);
4099      // Cannot close file right now, since the last block requires recovery.
4100      // This may potentially cause infinite loop in lease recovery
4101      // if there are no valid replicas on data-nodes.
4102      NameNode.stateChangeLog.warn(
4103                "DIR* NameSystem.internalReleaseLease: " +
4104                "File " + src + " has not been closed." +
4105               " Lease recovery is in progress. " +
4106                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
4107      break;
4108    }
4109    return false;
4110  }
4111
4112  private Lease reassignLease(Lease lease, String src, String newHolder,
4113      INodeFile pendingFile) {
4114    assert hasWriteLock();
4115    if(newHolder == null)
4116      return lease;
4117    // The following transaction is not synced. Make sure it's sync'ed later.
4118    logReassignLease(lease.getHolder(), src, newHolder);
4119    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4120  }
4121  
4122  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
4123      INodeFile pendingFile) {
4124    assert hasWriteLock();
4125    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
4126    return leaseManager.reassignLease(lease, src, newHolder);
4127  }
4128
  /**
   * Delegate to the BlockManager to commit or complete the last block of an
   * under-construction file.
   *
   * @param fileINode the file being written; must be under construction
   * @param iip the resolved path to {@code fileINode}
   * @param commitBlock the block reported as the file's last block
   * @throws IOException if the block cannot be committed or completed
   */
  private void commitOrCompleteLastBlock(final INodeFile fileINode,
      final INodesInPath iip, final Block commitBlock) throws IOException {
    assert hasWriteLock();
    Preconditions.checkArgument(fileINode.isUnderConstruction());
    blockManager.commitOrCompleteLastBlock(fileINode, commitBlock, iip);
  }
4135
  /**
   * Turn an under-construction file into a complete one: record the
   * modification against the latest snapshot, drop the under-construction
   * feature, release the lease, and log the file close.
   *
   * @param src full path of the file
   * @param pendingFile the file being finalized
   * @param latestSnapshot id of the latest snapshot to record against
   * @throws IOException if the file is not under construction
   */
  private void finalizeINodeFileUnderConstruction(String src,
      INodeFile pendingFile, int latestSnapshot) throws IOException {
    assert hasWriteLock();

    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
    if (uc == null) {
      throw new IOException("Cannot finalize file " + src
          + " because it is not under construction");
    }
    
    pendingFile.recordModification(latestSnapshot);

    // The file is no longer pending.
    // Create permanent INode, update blocks. No need to replace the inode here
    // since we just remove the uc feature from pendingFile
    pendingFile.toCompleteFile(now());

    // Release the lease held by the (now finished) writer.
    leaseManager.removeLease(uc.getClientName(), src);

    waitForLoadingFSImage();
    // close file and persist block allocations for this file
    closeFile(src, pendingFile);

    // Re-check replication of the file's blocks now that it is closed.
    blockManager.checkReplication(pendingFile);
  }
4161
  /**
   * Look up the block in the BlockManager's blocksMap.
   *
   * @return the stored {@link BlockInfoContiguous}, or null if the block is
   *         not present (callers such as commitBlockSynchronization rely on
   *         the null return)
   */
  @VisibleForTesting
  BlockInfoContiguous getStoredBlock(Block block) {
    return blockManager.getStoredBlock(block);
  }
4166  
4167  @Override
4168  public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
4169    assert hasReadLock();
4170    final BlockCollection bc = blockUC.getBlockCollection();
4171    if (bc == null || !(bc instanceof INodeFile)
4172        || !bc.isUnderConstruction()) {
4173      return false;
4174    }
4175
4176    String fullName = bc.getName();
4177    try {
4178      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4179          && dir.getINode(fullName) == bc) {
4180        // If file exists in normal path then no need to look in snapshot
4181        return false;
4182      }
4183    } catch (UnresolvedLinkException e) {
4184      LOG.error("Error while resolving the link : " + fullName, e);
4185      return false;
4186    }
4187    /*
4188     * 1. if bc is under construction and also with snapshot, and
4189     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4190     * file. 
4191     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4192     * current fsdirectory tree. 
4193     * 3. if bc is not the current node associated with fullName, bc must be a
4194     * snapshot inode.
4195     */
4196    return true;
4197  }
4198
  /**
   * Commit the result of a block recovery, as reported by the primary
   * datanode: update (or delete) the recovered last block of the file,
   * register the recovered replica locations, and optionally close the file.
   *
   * @param oldBlock the block as known before recovery
   * @param newgenerationstamp the recovery id; must match the block's
   *        current recovery id
   * @param newlength the recovered length of the block
   * @param closeFile whether the file should be closed after the commit
   * @param deleteblock whether the block should be removed instead of updated
   * @param newtargets datanodes holding the recovered replicas
   * @param newtargetstorages storage IDs parallel to {@code newtargets}
   * @throws IOException if the block/file cannot be found or the recovery id
   *         does not match
   */
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    final String src;
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.

      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      src = iFile.getFullPathName();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + src + ", likely due to delayed block removal");
      }
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        // Likely a retry of an already-committed synchronization; nothing
        // more to do.
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      // copy-on-truncate recovery writes into a block with a different id
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]),
                "src=%s, oldBlock=%s, newgenerationstamp=%d, newlength=%d",
                src, oldBlock, newgenerationstamp, newlength);

        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas still carrying the old genstamp/length are stale.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          closeFileCommitBlocks(src, iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          closeFileCommitBlocks(src, iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock("commitBlockSynchronization");
    }
    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }
4377
4378  /**
4379   * @param pendingFile open file that needs to be closed
4380   * @param storedBlock last block
4381   * @throws IOException on error
4382   */
4383  @VisibleForTesting
4384  void closeFileCommitBlocks(String src, INodeFile pendingFile,
4385      BlockInfoContiguous storedBlock) throws IOException {
4386    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
4387
4388    // commit the last block and complete it if it has minimum replicas
4389    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);
4390
4391    //remove lease, close file
4392    finalizeINodeFileUnderConstruction(src, pendingFile,
4393        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4394  }
4395
4396  /**
4397   * Renew the lease(s) held by the given client
4398   */
4399  void renewLease(String holder) throws IOException {
4400    checkOperation(OperationCategory.WRITE);
4401    readLock();
4402    try {
4403      checkOperation(OperationCategory.WRITE);
4404      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4405      leaseManager.renewLease(holder);
4406    } finally {
4407      readUnlock("renewLease");
4408    }
4409  }
4410
4411  /**
4412   * Get a partial listing of the indicated directory
4413   *
4414   * @param src the directory name
4415   * @param startAfter the name to start after
4416   * @param needLocation if blockLocations need to be returned
4417   * @return a partial listing starting after startAfter
4418   * 
4419   * @throws AccessControlException if access is denied
4420   * @throws UnresolvedLinkException if symbolic link is encountered
4421   * @throws IOException if other I/O error occurred
4422   */
4423  DirectoryListing getListing(String src, byte[] startAfter,
4424      boolean needLocation) 
4425      throws IOException {
4426    checkOperation(OperationCategory.READ);
4427    final String operationName = "listStatus";
4428    DirectoryListing dl = null;
4429    readLock();
4430    try {
4431      checkOperation(NameNode.OperationCategory.READ);
4432      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
4433          needLocation);
4434    } catch (AccessControlException e) {
4435      logAuditEvent(false, operationName, src);
4436      throw e;
4437    } finally {
4438      readUnlock(operationName);
4439    }
4440    logAuditEvent(true, operationName, src);
4441    return dl;
4442  }
4443
4444  /////////////////////////////////////////////////////////
4445  //
4446  // These methods are called by datanodes
4447  //
4448  /////////////////////////////////////////////////////////
4449  /**
4450   * Register Datanode.
4451   * <p>
4452   * The purpose of registration is to identify whether the new datanode
4453   * serves a new data storage, and will report new data block copies,
4454   * which the namenode was not aware of; or the datanode is a replacement
4455   * node for the data storage that was previously served by a different
4456   * or the same (in terms of host:port) datanode.
4457   * The data storages are distinguished by their storageIDs. When a new
4458   * data storage is reported the namenode issues a new unique storageID.
4459   * <p>
4460   * Finally, the namenode returns its namespaceID as the registrationID
4461   * for the datanodes. 
4462   * namespaceID is a persistent attribute of the name space.
4463   * The registrationID is checked every time the datanode is communicating
4464   * with the namenode. 
4465   * Datanodes with inappropriate registrationID are rejected.
4466   * If the namenode stops, and then restarts it can restore its 
4467   * namespaceID and will continue serving the datanodes that has previously
4468   * registered with the namenode without restarting the whole cluster.
4469   * 
4470   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4471   */
4472  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4473    writeLock();
4474    try {
4475      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4476      checkSafeMode();
4477    } finally {
4478      writeUnlock("registerDatanode");
4479    }
4480  }
4481  
4482  /**
4483   * Get registrationID for datanodes based on the namespaceID.
4484   * 
4485   * @see #registerDatanode(DatanodeRegistration)
4486   * @return registration ID
4487   */
4488  String getRegistrationID() {
4489    return Storage.getRegistrationID(getFSImage().getStorage());
4490  }
4491
4492  /**
4493   * The given node has reported in.  This method should:
4494   * 1) Record the heartbeat, so the datanode isn't timed out
4495   * 2) Adjust usage stats for future block allocation
4496   * 
4497   * If a substantial amount of time passed since the last datanode 
4498   * heartbeat then request an immediate block report.  
4499   * 
4500   * @return an array of datanode commands 
4501   * @throws IOException
4502   */
4503  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4504      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4505      int xceiverCount, int xmitsInProgress, int failedVolumes,
4506      VolumeFailureSummary volumeFailureSummary) throws IOException {
4507    readLock();
4508    try {
4509      //get datanode commands
4510      final int maxTransfer = blockManager.getMaxReplicationStreams()
4511          - xmitsInProgress;
4512      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4513          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4514          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
4515      
4516      //create ha status
4517      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4518          haContext.getState().getServiceState(),
4519          getFSImage().getCorrectLastAppliedOrWrittenTxId());
4520
4521      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4522    } finally {
4523      readUnlock("handleHeartbeat");
4524    }
4525  }
4526
4527  /**
4528   * Returns whether or not there were available resources at the last check of
4529   * resources.
4530   *
4531   * @return true if there were sufficient resources available, false otherwise.
4532   */
4533  boolean nameNodeHasResourcesAvailable() {
4534    return hasResourcesAvailable;
4535  }
4536
4537  /**
4538   * Perform resource checks and cache the results.
4539   */
4540  void checkAvailableResources() {
4541    Preconditions.checkState(nnResourceChecker != null,
4542        "nnResourceChecker not initialized");
4543    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4544  }
4545
4546  /**
4547   * Persist the block list for the inode.
4548   * @param path
4549   * @param file
4550   * @param logRetryCache
4551   */
4552  private void persistBlocks(String path, INodeFile file,
4553                             boolean logRetryCache) {
4554    assert hasWriteLock();
4555    Preconditions.checkArgument(file.isUnderConstruction());
4556    getEditLog().logUpdateBlocks(path, file, logRetryCache);
4557    NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" +
4558        " peristed to the file system", path, file.getBlocks().length);
4559  }
4560
4561  /**
4562   * Close file.
4563   * @param path
4564   * @param file
4565   */
4566  private void closeFile(String path, INodeFile file) {
4567    assert hasWriteLock();
4568    waitForLoadingFSImage();
4569    // file is closed
4570    getEditLog().logCloseFile(path, file);
4571    NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" +
4572        " to the file system", path, file.getBlocks().length);
4573  }
4574
4575  /**
4576   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4577   * there are found to be insufficient resources available, causes the NN to
4578   * enter safe mode. If resources are later found to have returned to
4579   * acceptable levels, this daemon will cause the NN to exit safe mode.
4580   */
4581  class NameNodeResourceMonitor implements Runnable  {
4582    boolean shouldNNRmRun = true;
4583    @Override
4584    public void run () {
4585      try {
4586        while (fsRunning && shouldNNRmRun) {
4587          checkAvailableResources();
4588          if(!nameNodeHasResourcesAvailable()) {
4589            String lowResourcesMsg = "NameNode low on available disk space. ";
4590            if (!isInSafeMode()) {
4591              LOG.warn(lowResourcesMsg + "Entering safe mode.");
4592            } else {
4593              LOG.warn(lowResourcesMsg + "Already in safe mode.");
4594            }
4595            enterSafeMode(true);
4596          }
4597          try {
4598            Thread.sleep(resourceRecheckInterval);
4599          } catch (InterruptedException ie) {
4600            // Deliberately ignore
4601          }
4602        }
4603      } catch (Exception e) {
4604        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4605      }
4606    }
4607
4608    public void stopMonitor() {
4609      shouldNNRmRun = false;
4610    }
4611 }
4612
4613  class NameNodeEditLogRoller implements Runnable {
4614
4615    private boolean shouldRun = true;
4616    private final long rollThreshold;
4617    private final long sleepIntervalMs;
4618
4619    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4620        this.rollThreshold = rollThreshold;
4621        this.sleepIntervalMs = sleepIntervalMs;
4622    }
4623
4624    @Override
4625    public void run() {
4626      while (fsRunning && shouldRun) {
4627        try {
4628          FSEditLog editLog = getFSImage().getEditLog();
4629          long numEdits =
4630              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4631          if (numEdits > rollThreshold) {
4632            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4633                + " number of edits in open segment exceeds threshold of "
4634                + rollThreshold);
4635            rollEditLog();
4636          }
4637        } catch (Exception e) {
4638          FSNamesystem.LOG.error("Swallowing exception in "
4639              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4640        }
4641        try {
4642          Thread.sleep(sleepIntervalMs);
4643        } catch (InterruptedException e) {
4644          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4645              + " was interrupted, exiting");
4646          break;
4647        }
4648      }
4649    }
4650
4651    public void stop() {
4652      shouldRun = false;
4653    }
4654  }
4655
4656  /**
4657   * Daemon to periodically scan the namespace for lazyPersist files
4658   * with missing blocks and unlink them.
4659   */
4660  class LazyPersistFileScrubber implements Runnable {
4661    private volatile boolean shouldRun = true;
4662    final int scrubIntervalSec;
4663    public LazyPersistFileScrubber(final int scrubIntervalSec) {
4664      this.scrubIntervalSec = scrubIntervalSec;
4665    }
4666
4667    /**
4668     * Periodically go over the list of lazyPersist files with missing
4669     * blocks and unlink them from the namespace.
4670     */
4671    private void clearCorruptLazyPersistFiles()
4672        throws IOException {
4673
4674      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
4675
4676      List<BlockCollection> filesToDelete = new ArrayList<>();
4677      boolean changed = false;
4678      writeLock();
4679      try {
4680        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
4681
4682        while (it.hasNext()) {
4683          Block b = it.next();
4684          BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b);
4685          if (blockInfo == null) {
4686            LOG.info("Cannot find block info for block " + b);
4687          } else {
4688            if (blockInfo.getBlockCollection().getStoragePolicyID()
4689                == lpPolicy.getId()) {
4690              filesToDelete.add(blockInfo.getBlockCollection());
4691            }
4692          }
4693        }
4694
4695        for (BlockCollection bc : filesToDelete) {
4696          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
4697          BlocksMapUpdateInfo toRemoveBlocks =
4698              FSDirDeleteOp.deleteInternal(
4699                  FSNamesystem.this, bc.getName(),
4700                  INodesInPath.fromINode((INodeFile) bc), false);
4701          changed |= toRemoveBlocks != null;
4702          if (toRemoveBlocks != null) {
4703            removeBlocks(toRemoveBlocks); // Incremental deletion of blocks
4704          }
4705        }
4706      } finally {
4707        writeUnlock("clearCorruptLazyPersistFiles");
4708      }
4709      if (changed) {
4710        getEditLog().logSync();
4711      }
4712    }
4713
4714    @Override
4715    public void run() {
4716      while (fsRunning && shouldRun) {
4717        try {
4718          clearCorruptLazyPersistFiles();
4719        } catch (Exception e) {
4720          FSNamesystem.LOG.error(
4721              "Ignoring exception in LazyPersistFileScrubber:", e);
4722        }
4723
4724        try {
4725          Thread.sleep(scrubIntervalSec * 1000);
4726        } catch (InterruptedException e) {
4727          FSNamesystem.LOG.info(
4728              "LazyPersistFileScrubber was interrupted, exiting");
4729          break;
4730        }
4731      }
4732    }
4733
4734    public void stop() {
4735      shouldRun = false;
4736    }
4737  }
4738
  /** @return the FSImage backing this namesystem. */
  public FSImage getFSImage() {
    return fsImage;
  }
4742
  /** @return the edit log owned by the current FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }    
4746
4747  private void checkBlock(ExtendedBlock block) throws IOException {
4748    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4749      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4750          + " - expected " + blockPoolId);
4751    }
4752  }
4753
  /** @return number of missing blocks, as tracked by the BlockManager. */
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking
    return blockManager.getMissingBlocksCount();
  }
4759
  /** @return number of missing blocks whose replication factor is 1. */
  @Metric({"MissingReplOneBlocks", "Number of missing blocks " +
      "with replication factor 1"})
  public long getMissingReplOneBlocksCount() {
    // not locking
    return blockManager.getMissingReplOneBlocksCount();
  }
4766  
  /** @return number of datanode heartbeats that have expired. */
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
4771  
  /**
   * @return number of transactions written since the most recent checkpoint
   *         (last written txid minus the checkpoint's txid).
   */
  @Metric({"TransactionsSinceLastCheckpoint",
      "Number of transactions since last checkpoint"})
  public long getTransactionsSinceLastCheckpoint() {
    return getEditLog().getLastWrittenTxIdWithoutLock() -
        getFSImage().getStorage().getMostRecentCheckpointTxId();
  }
4778  
4779  @Metric({"TransactionsSinceLastLogRoll",
4780      "Number of transactions since last edit log roll"})
4781  public long getTransactionsSinceLastLogRoll() {
4782    if (isInStandbyState() || !getEditLog().isSegmentOpenWithoutLock()) {
4783      return 0;
4784    } else {
4785      return getEditLog().getLastWrittenTxIdWithoutLock() -
4786          getEditLog().getCurSegmentTxIdWithoutLock() + 1;
4787    }
4788  }
4789
  /** @return the last transaction id written to the edit log. */
  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxIdWithoutLock();
  }
4794  
  /** @return epoch-millisecond timestamp of the most recent checkpoint. */
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
4800
  /**
   * Aggregate datanode statistics, with block-health counters patched into
   * the slot layout defined by {@link ClientProtocol#getStats()}.
   * No lock is taken, so the individual values may be mutually inconsistent.
   * @see ClientProtocol#getStats()
   */
  long[] getStats() {
    final long[] stats = datanodeStatistics.getStats();
    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
    stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] =
        getMissingReplOneBlocksCount();
    return stats;
  }
4811
  /** @return total raw capacity of all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }
4818
  /** @return total raw capacity of all datanodes, rounded to GB. */
  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }
4824
  /** @return total DFS-used capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }
4831
  /** @return total DFS-used capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }
4837
  /** @return remaining capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }
4843
  /** @return remaining capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }
4848
  /** @return space used by datanodes for non-DFS purposes, in bytes. */
  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
4854
4855  /**
4856   * Total number of connections.
4857   */
4858  @Override // FSNamesystemMBean
4859  @Metric
4860  public int getTotalLoad() {
4861    return datanodeStatistics.getXceiverCount();
4862  }
4863  
  /** @return number of snapshottable directories, per the SnapshotManager. */
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }
4868
  /** @return number of snapshots, per the SnapshotManager. */
  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
4873
4874  @Override
4875  public String getSnapshotStats() {
4876    Map<String, Object> info = new HashMap<String, Object>();
4877    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4878    info.put("Snapshots", this.getNumSnapshots());
4879    return JSON.toString(info);
4880  }
4881
  /** @return number of encryption zones, per the EZ manager. */
  @Override // FSNamesystemMBean
  @Metric({ "NumEncryptionZones", "The number of encryption zones" })
  public int getNumEncryptionZones() {
    return dir.ezManager.getNumEncryptionZones();
  }
4887
4888  /**
4889   * Returns the length of the wait Queue for the FSNameSystemLock.
4890   *
4891   * A larger number here indicates lots of threads are waiting for
4892   * FSNameSystemLock.
4893   *
4894   * @return int - Number of Threads waiting to acquire FSNameSystemLock
4895   */
4896  @Override
4897  @Metric({"LockQueueLength", "Number of threads waiting to " +
4898      "acquire FSNameSystemLock"})
4899  public int getFsLockQueueLength() {
4900    return fsLock.getQueueLength();
4901  }
4902
4903  int getNumberOfDatanodes(DatanodeReportType type) {
4904    readLock();
4905    try {
4906      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4907          type).size(); 
4908    } finally {
4909      readUnlock("getNumberOfDatanodes");
4910    }
4911  }
4912
4913  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4914      ) throws AccessControlException, StandbyException {
4915    checkSuperuserPrivilege();
4916    checkOperation(OperationCategory.UNCHECKED);
4917    readLock();
4918    try {
4919      checkOperation(OperationCategory.UNCHECKED);
4920      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4921      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4922
4923      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4924      for (int i=0; i<arr.length; i++) {
4925        arr[i] = new DatanodeInfo(results.get(i));
4926      }
4927      return arr;
4928    } finally {
4929      readUnlock("datanodeReport");
4930    }
4931  }
4932
4933  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
4934      ) throws AccessControlException, StandbyException {
4935    checkSuperuserPrivilege();
4936    checkOperation(OperationCategory.UNCHECKED);
4937    readLock();
4938    try {
4939      checkOperation(OperationCategory.UNCHECKED);
4940      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4941      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);
4942
4943      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
4944      for (int i = 0; i < reports.length; i++) {
4945        final DatanodeDescriptor d = datanodes.get(i);
4946        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
4947            d.getStorageReports());
4948      }
4949      return reports;
4950    } finally {
4951      readUnlock("getDatanodeStorageReport");
4952    }
4953  }
4954
4955  /**
4956   * Save namespace image.
4957   * This will save current namespace into fsimage file and empty edits file.
4958   * Requires superuser privilege and safe mode.
4959   * 
4960   * @throws AccessControlException if superuser privilege is violated.
4961   * @throws IOException if 
4962   */
4963  void saveNamespace() throws AccessControlException, IOException {
4964    checkOperation(OperationCategory.UNCHECKED);
4965    checkSuperuserPrivilege();
4966
4967    cpLock();  // Block if a checkpointing is in progress on standby.
4968    readLock();
4969    try {
4970      checkOperation(OperationCategory.UNCHECKED);
4971
4972      if (!isInSafeMode()) {
4973        throw new IOException("Safe mode should be turned ON "
4974            + "in order to create namespace image.");
4975      }
4976      getFSImage().saveNamespace(this);
4977    } finally {
4978      readUnlock("saveNamespace");
4979      cpUnlock();
4980    }
4981    LOG.info("New namespace image has been created");
4982  }
4983  
4984  /**
4985   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4986   * Requires superuser privilege.
4987   * 
4988   * @throws AccessControlException if superuser privilege is violated.
4989   */
4990  boolean restoreFailedStorage(String arg) throws AccessControlException,
4991      StandbyException {
4992    checkSuperuserPrivilege();
4993    checkOperation(OperationCategory.UNCHECKED);
4994    cpLock();  // Block if a checkpointing is in progress on standby.
4995    writeLock();
4996    try {
4997      checkOperation(OperationCategory.UNCHECKED);
4998      
4999      // if it is disabled - enable it and vice versa.
5000      if(arg.equals("check"))
5001        return getFSImage().getStorage().getRestoreFailedStorage();
5002      
5003      boolean val = arg.equals("true");  // false if not
5004      getFSImage().getStorage().setRestoreFailedStorage(val);
5005      
5006      return val;
5007    } finally {
5008      writeUnlock("restoreFailedStorage");
5009      cpUnlock();
5010    }
5011  }
5012
  /** @return the namesystem start time as a freshly allocated Date. */
  Date getStartTime() {
    return new Date(startTime); 
  }
5016    
  /**
   * Finalize an in-progress upgrade by delegating to the FSImage.
   * Requires superuser privilege; takes cpLock and the write lock.
   *
   * @throws IOException if the upgrade cannot be finalized
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    cpLock();  // Block if a checkpointing is in progress on standby.
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock("finalizeUpgrade");
      cpUnlock();
    }
  }
5030
  /**
   * Ask the DatanodeManager to refresh its node lists from a freshly loaded
   * HdfsConfiguration. Requires superuser privilege.
   *
   * @throws IOException if the refresh fails
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
5036
  /**
   * Set the balancer bandwidth via the DatanodeManager.
   * Requires superuser privilege.
   *
   * @param bandwidth new bandwidth value passed through to the datanodes
   * @throws IOException if the update fails
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
5042
5043  /**
5044   * Persist the new block (the last block of the given file).
5045   * @param path
5046   * @param file
5047   */
5048  private void persistNewBlock(String path, INodeFile file) {
5049    Preconditions.checkArgument(file.isUnderConstruction());
5050    getEditLog().logAddBlock(path, file);
5051    NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," +
5052        " current total block count is {}", path,
5053        file.getLastBlock().toString(), file.getBlocks().length);
5054  }
5055
5056  /**
5057   * SafeModeInfo contains information related to the safe mode.
5058   * <p>
5059   * An instance of {@link SafeModeInfo} is created when the name node
5060   * enters safe mode.
5061   * <p>
5062   * During name node startup {@link SafeModeInfo} counts the number of
5063   * <em>safe blocks</em>, those that have at least the minimal number of
5064   * replicas, and calculates the ratio of safe blocks to the total number
5065   * of blocks in the system, which is the size of blocks in
5066   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5067   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5068   * to monitor whether the safe mode {@link #extension} is passed.
5069   * Then it leaves safe mode and destroys itself.
5070   * <p>
5071   * If safe mode is turned on manually then the number of safe blocks is
5072   * not tracked because the name node is not intended to leave safe mode
5073   * automatically in the case.
5074   *
5075   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5076   */
5077  public class SafeModeInfo {
5078    // configuration fields
5079    /** Safe mode threshold condition %.*/
5080    private final double threshold;
5081    /** Safe mode minimum number of datanodes alive */
5082    private final int datanodeThreshold;
5083    /**
5084     * Safe mode extension after the threshold.
5085     * Make it volatile so that getSafeModeTip can read the latest value
5086     * without taking a lock.
5087     */
5088    private volatile int extension;
5089    /** Min replication required by safe mode. */
5090    private final int safeReplication;
5091    /** threshold for populating needed replication queues */
5092    private final double replQueueThreshold;
5093    // internal fields
5094    /** Time when threshold was reached.
5095     * <br> -1 safe mode is off
5096     * <br> 0 safe mode is on, and threshold is not reached yet
5097     * <br> >0 safe mode is on, but we are in extension period 
5098     */
5099    private long reached = -1;  
5100    private long reachedTimestamp = -1;
5101    /** Total number of blocks. */
5102    int blockTotal; 
5103    /** Number of safe blocks. */
5104    int blockSafe;
5105    /** Number of blocks needed to satisfy safe mode threshold condition */
5106    private int blockThreshold;
5107    /** Number of blocks needed before populating replication queues */
5108    private int blockReplQueueThreshold;
5109    /** time of the last status printout */
5110    private long lastStatusReport = 0;
5111    /**
5112     * Was safemode entered automatically because available resources were low.
5113     * Make it volatile so that getSafeModeTip can read the latest value
5114     * without taking a lock.
5115     */
5116    private volatile boolean resourcesLow = false;
5117    /** Should safemode adjust its block totals as blocks come in */
5118    private boolean shouldIncrementallyTrackBlocks = false;
5119    /** counter for tracking startup progress of reported blocks */
5120    private Counter awaitingReportedBlocksCounter;
5121    
5122    /**
5123     * Creates SafeModeInfo when the name node enters
5124     * automatic safe mode at startup.
5125     *  
5126     * @param conf configuration
5127     */
5128    private SafeModeInfo(Configuration conf) {
5129      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5130          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5131      if(threshold > 1.0) {
5132        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5133      }
5134      this.datanodeThreshold = conf.getInt(
5135        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5136        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5137      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5138      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5139                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5140      
5141      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5142      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5143      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5144
5145      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5146      this.replQueueThreshold = 
5147        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5148                      (float) threshold);
5149      this.blockTotal = 0; 
5150      this.blockSafe = 0;
5151    }
5152
5153    /**
5154     * In the HA case, the StandbyNode can be in safemode while the namespace
5155     * is modified by the edit log tailer. In this case, the number of total
5156     * blocks changes as edits are processed (eg blocks are added and deleted).
5157     * However, we don't want to do the incremental tracking during the
5158     * startup-time loading process -- only once the initial total has been
5159     * set after the image has been loaded.
5160     */
5161    private boolean shouldIncrementallyTrackBlocks() {
5162      return shouldIncrementallyTrackBlocks;
5163    }
5164
5165    /**
5166     * Creates SafeModeInfo when safe mode is entered manually, or because
5167     * available resources are low.
5168     *
5169     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5170     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5171     * 
5172     * @see SafeModeInfo
5173     */
5174    private SafeModeInfo(boolean resourcesLow) {
5175      this.threshold = 1.5f;  // this threshold can never be reached
5176      this.datanodeThreshold = Integer.MAX_VALUE;
5177      this.extension = Integer.MAX_VALUE;
5178      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
5179      this.replQueueThreshold = 1.5f; // can never be reached
5180      this.blockTotal = -1;
5181      this.blockSafe = -1;
5182      this.resourcesLow = resourcesLow;
5183      enter();
5184      reportStatus("STATE* Safe mode is ON.", true);
5185    }
5186      
5187    /**
5188     * Check if safe mode is on.
5189     * @return true if in safe mode
5190     */
5191    private synchronized boolean isOn() {
5192      doConsistencyCheck();
5193      return this.reached >= 0;
5194    }
5195      
5196    /**
5197     * Enter safe mode.
5198     */
5199    private void enter() {
5200      this.reached = 0;
5201      this.reachedTimestamp = 0;
5202    }
5203      
5204    /**
5205     * Leave safe mode.
5206     * <p>
5207     * Check for invalid, under- & over-replicated blocks in the end of startup.
5208     */
5209    private synchronized void leave() {
5210      // if not done yet, initialize replication queues.
5211      // In the standby, do not populate repl queues
5212      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
5213        initializeReplQueues();
5214      }
5215      long timeInSafemode = now() - startTime;
5216      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
5217                                    + timeInSafemode/1000 + " secs");
5218      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
5219
5220      //Log the following only once (when transitioning from ON -> OFF)
5221      if (reached >= 0) {
5222        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
5223      }
5224      reached = -1;
5225      reachedTimestamp = -1;
5226      safeMode = null;
5227      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
5228      NameNode.stateChangeLog.info("STATE* Network topology has "
5229          + nt.getNumOfRacks() + " racks and "
5230          + nt.getNumOfLeaves() + " datanodes");
5231      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
5232          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
5233
5234      startSecretManagerIfNecessary();
5235
5236      // If startup has not yet completed, end safemode phase.
5237      StartupProgress prog = NameNode.getStartupProgress();
5238      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5239        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
5240        prog.endPhase(Phase.SAFEMODE);
5241      }
5242    }
5243
5244    /**
5245     * Check whether we have reached the threshold for 
5246     * initializing replication queues.
5247     */
5248    private synchronized boolean canInitializeReplQueues() {
5249      return shouldPopulateReplQueues()
5250          && blockSafe >= blockReplQueueThreshold;
5251    }
5252      
5253    /** 
5254     * Safe mode can be turned off iff 
5255     * the threshold is reached and 
5256     * the extension time have passed.
5257     * @return true if can leave or false otherwise.
5258     */
5259    private synchronized boolean canLeave() {
5260      if (reached == 0) {
5261        return false;
5262      }
5263
5264      if (monotonicNow() - reached < extension) {
5265        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
5266        return false;
5267      }
5268
5269      if (needEnter()) {
5270        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
5271        return false;
5272      }
5273
5274      return true;
5275    }
5276      
5277    /** 
5278     * There is no need to enter safe mode 
5279     * if DFS is empty or {@link #threshold} == 0
5280     */
5281    private boolean needEnter() {
5282      return (threshold != 0 && blockSafe < blockThreshold) ||
5283        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5284        (!nameNodeHasResourcesAvailable());
5285    }
5286      
5287    /**
5288     * Check and trigger safe mode if needed. 
5289     */
5290    private void checkMode() {
5291      // Have to have write-lock since leaving safemode initializes
5292      // repl queues, which requires write lock
5293      assert hasWriteLock();
5294      if (inTransitionToActive()) {
5295        return;
5296      }
5297      // if smmthread is already running, the block threshold must have been 
5298      // reached before, there is no need to enter the safe mode again
5299      if (smmthread == null && needEnter()) {
5300        enter();
5301        // check if we are ready to initialize replication queues
5302        if (canInitializeReplQueues() && !isPopulatingReplQueues()
5303            && !haEnabled) {
5304          initializeReplQueues();
5305        }
5306        reportStatus("STATE* Safe mode ON.", false);
5307        return;
5308      }
5309      // the threshold is reached or was reached before
5310      if (!isOn() ||                           // safe mode is off
5311          extension <= 0 || threshold <= 0) {  // don't need to wait
5312        this.leave(); // leave safe mode
5313        return;
5314      }
5315      if (reached > 0) {  // threshold has already been reached before
5316        reportStatus("STATE* Safe mode ON.", false);
5317        return;
5318      }
5319      // start monitor
5320      reached = monotonicNow();
5321      reachedTimestamp = now();
5322      if (smmthread == null) {
5323        smmthread = new Daemon(new SafeModeMonitor());
5324        smmthread.start();
5325        reportStatus("STATE* Safe mode extension entered.", true);
5326      }
5327
5328      // check if we are ready to initialize replication queues
5329      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
5330        initializeReplQueues();
5331      }
5332    }
5333      
5334    /**
5335     * Set total number of blocks.
5336     */
5337    private synchronized void setBlockTotal(int total) {
5338      this.blockTotal = total;
5339      this.blockThreshold = (int) (blockTotal * threshold);
5340      this.blockReplQueueThreshold = 
5341        (int) (blockTotal * replQueueThreshold);
5342      if (haEnabled) {
5343        // After we initialize the block count, any further namespace
5344        // modifications done while in safe mode need to keep track
5345        // of the number of total blocks in the system.
5346        this.shouldIncrementallyTrackBlocks = true;
5347      }
5348      if(blockSafe < 0)
5349        this.blockSafe = 0;
5350      checkMode();
5351    }
5352      
5353    /**
5354     * Increment number of safe blocks if current block has 
5355     * reached minimal replication.
5356     * @param replication current replication 
5357     */
5358    private synchronized void incrementSafeBlockCount(short replication) {
5359      if (replication == safeReplication) {
5360        this.blockSafe++;
5361
5362        // Report startup progress only if we haven't completed startup yet.
5363        StartupProgress prog = NameNode.getStartupProgress();
5364        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5365          if (this.awaitingReportedBlocksCounter == null) {
5366            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
5367              STEP_AWAITING_REPORTED_BLOCKS);
5368          }
5369          this.awaitingReportedBlocksCounter.increment();
5370        }
5371
5372        checkMode();
5373      }
5374    }
5375      
5376    /**
5377     * Decrement number of safe blocks if current block has 
5378     * fallen below minimal replication.
5379     * @param replication current replication 
5380     */
5381    private synchronized void decrementSafeBlockCount(short replication) {
5382      if (replication == safeReplication-1) {
5383        this.blockSafe--;
5384        //blockSafe is set to -1 in manual / low resources safemode
5385        assert blockSafe >= 0 || isManual() || areResourcesLow();
5386        checkMode();
5387      }
5388    }
5389
5390    /**
5391     * Check if safe mode was entered manually
5392     */
5393    private boolean isManual() {
5394      return extension == Integer.MAX_VALUE;
5395    }
5396
5397    /**
5398     * Set manual safe mode.
5399     */
5400    private synchronized void setManual() {
5401      extension = Integer.MAX_VALUE;
5402    }
5403
5404    /**
5405     * Check if safe mode was entered due to resources being low.
5406     */
5407    private boolean areResourcesLow() {
5408      return resourcesLow;
5409    }
5410
5411    /**
5412     * Set that resources are low for this instance of safe mode.
5413     */
5414    private void setResourcesLow() {
5415      resourcesLow = true;
5416    }
5417
5418    /**
5419     * A tip on how safe mode is to be turned off: manually or automatically.
5420     */
5421    String getTurnOffTip() {
5422      if(!isOn()) {
5423        return "Safe mode is OFF.";
5424      }
5425
5426      //Manual OR low-resource safemode. (Admin intervention required)
5427      String adminMsg = "It was turned on manually. ";
5428      if (areResourcesLow()) {
5429        adminMsg = "Resources are low on NN. Please add or free up more "
5430          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5431          + " safe mode before adding resources, "
5432          + "the NN will immediately return to safe mode. ";
5433      }
5434      if (isManual() || areResourcesLow()) {
5435        return adminMsg
5436          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5437      }
5438
5439      boolean thresholdsMet = true;
5440      int numLive = getNumLiveDataNodes();
5441      String msg = "";
5442      if (blockSafe < blockThreshold) {
5443        msg += String.format(
5444          "The reported blocks %d needs additional %d"
5445          + " blocks to reach the threshold %.4f of total blocks %d.%n",
5446          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5447        thresholdsMet = false;
5448      } else {
5449        msg += String.format("The reported blocks %d has reached the threshold"
5450            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5451      }
5452      if (numLive < datanodeThreshold) {
5453        msg += String.format(
5454          "The number of live datanodes %d needs an additional %d live "
5455          + "datanodes to reach the minimum number %d.%n",
5456          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5457        thresholdsMet = false;
5458      } else {
5459        msg += String.format("The number of live datanodes %d has reached "
5460            + "the minimum number %d. ",
5461            numLive, datanodeThreshold);
5462      }
5463      msg += (reached > 0) ? "In safe mode extension. " : "";
5464      msg += "Safe mode will be turned off automatically ";
5465
5466      if (!thresholdsMet) {
5467        msg += "once the thresholds have been reached.";
5468      } else if (reached + extension - monotonicNow() > 0) {
5469        msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds.");
5470      } else {
5471        msg += "soon.";
5472      }
5473
5474      return msg;
5475    }
5476
5477    /**
5478     * Print status every 20 seconds.
5479     */
5480    private void reportStatus(String msg, boolean rightNow) {
5481      long curTime = now();
5482      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5483        return;
5484      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5485      lastStatusReport = curTime;
5486    }
5487
5488    @Override
5489    public String toString() {
5490      String resText = "Current safe blocks = " 
5491        + blockSafe 
5492        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
5493        + ". Minimal replication = " + safeReplication + ".";
5494      if (reached > 0) 
5495        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
5496      return resText;
5497    }
5498      
5499    /**
5500     * Checks consistency of the class state.
5501     * This is costly so only runs if asserts are enabled.
5502     */
5503    private void doConsistencyCheck() {
5504      boolean assertsOn = false;
5505      assert assertsOn = true; // set to true if asserts are on
5506      if (!assertsOn) return;
5507      
5508      if (blockTotal == -1 && blockSafe == -1) {
5509        return; // manual safe mode
5510      }
5511      int activeBlocks = blockManager.getActiveBlockCount();
5512      if ((blockTotal != activeBlocks) &&
5513          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5514        throw new AssertionError(
5515            " SafeMode: Inconsistent filesystem state: "
5516        + "SafeMode data: blockTotal=" + blockTotal
5517        + " blockSafe=" + blockSafe + "; "
5518        + "BlockManager data: active="  + activeBlocks);
5519      }
5520    }
5521
    /**
     * Adjust the safe and total block counts while in safe mode. No-op
     * unless incremental tracking was enabled by setBlockTotal(), which
     * only happens when HA is enabled.
     *
     * @param deltaSafe change in the number of safe blocks
     * @param deltaTotal change in the number of total blocks
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      // Incremental tracking is only ever turned on in an HA setup.
      assert haEnabled;
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";
      
      // Order matters: update blockSafe first, then setBlockTotal()
      // recomputes the thresholds and re-runs checkMode().
      blockSafe += deltaSafe;
      setBlockTotal(blockTotal + deltaTotal);
    }
5541  }
5542    
5543  /**
5544   * Periodically check whether it is time to leave safe mode.
5545   * This thread starts when the threshold level is reached.
5546   *
5547   */
5548  class SafeModeMonitor implements Runnable {
5549    /** interval in msec for checking safe mode: {@value} */
5550    private static final long recheckInterval = 1000;
5551      
5552    /**
5553     */
5554    @Override
5555    public void run() {
5556      while (fsRunning) {
5557        writeLock();
5558        try {
5559          if (safeMode == null) { // Not in safe mode.
5560            break;
5561          }
5562          if (safeMode.canLeave()) {
5563            // Leave safe mode.
5564            safeMode.leave();
5565            smmthread = null;
5566            break;
5567          }
5568        } finally {
5569          writeUnlock();
5570        }
5571
5572        try {
5573          Thread.sleep(recheckInterval);
5574        } catch (InterruptedException ie) {
5575          // Ignored
5576        }
5577      }
5578      if (!fsRunning) {
5579        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5580      }
5581    }
5582  }
5583    
5584  boolean setSafeMode(SafeModeAction action) throws IOException {
5585    if (action != SafeModeAction.SAFEMODE_GET) {
5586      checkSuperuserPrivilege();
5587      switch(action) {
5588      case SAFEMODE_LEAVE: // leave safe mode
5589        leaveSafeMode();
5590        break;
5591      case SAFEMODE_ENTER: // enter safe mode
5592        enterSafeMode(false);
5593        break;
5594      default:
5595        LOG.error("Unexpected safe mode action");
5596      }
5597    }
5598    return isInSafeMode();
5599  }
5600
5601  @Override
5602  public void checkSafeMode() {
5603    // safeMode is volatile, and may be set to null at any time
5604    SafeModeInfo safeMode = this.safeMode;
5605    if (safeMode != null) {
5606      safeMode.checkMode();
5607    }
5608  }
5609
5610  @Override
5611  public boolean isInSafeMode() {
5612    // safeMode is volatile, and may be set to null at any time
5613    SafeModeInfo safeMode = this.safeMode;
5614    if (safeMode == null)
5615      return false;
5616    return safeMode.isOn();
5617  }
5618
5619  @Override
5620  public boolean isInStartupSafeMode() {
5621    // safeMode is volatile, and may be set to null at any time
5622    SafeModeInfo safeMode = this.safeMode;
5623    if (safeMode == null)
5624      return false;
5625    // If the NN is in safemode, and not due to manual / low resources, we
5626    // assume it must be because of startup. If the NN had low resources during
5627    // startup, we assume it came out of startup safemode and it is now in low
5628    // resources safemode
5629    return !safeMode.isManual() && !safeMode.areResourcesLow()
5630      && safeMode.isOn();
5631  }
5632
5633  /**
5634   * Check if replication queues are to be populated
5635   * @return true when node is HAState.Active and not in the very first safemode
5636   */
5637  @Override
5638  public boolean isPopulatingReplQueues() {
5639    if (!shouldPopulateReplQueues()) {
5640      return false;
5641    }
5642    return initializedReplQueues;
5643  }
5644
5645  private boolean shouldPopulateReplQueues() {
5646    if(haContext == null || haContext.getState() == null)
5647      return false;
5648    return haContext.getState().shouldPopulateReplQueues();
5649  }
5650
5651  @Override
5652  public void incrementSafeBlockCount(int replication) {
5653    // safeMode is volatile, and may be set to null at any time
5654    SafeModeInfo safeMode = this.safeMode;
5655    if (safeMode == null)
5656      return;
5657    safeMode.incrementSafeBlockCount((short)replication);
5658  }
5659
5660  @Override
5661  public void decrementSafeBlockCount(Block b) {
5662    // safeMode is volatile, and may be set to null at any time
5663    SafeModeInfo safeMode = this.safeMode;
5664    if (safeMode == null) // mostly true
5665      return;
5666    BlockInfoContiguous storedBlock = getStoredBlock(b);
5667    if (storedBlock.isComplete()) {
5668      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5669    }
5670  }
5671  
5672  /**
5673   * Adjust the total number of blocks safe and expected during safe mode.
5674   * If safe mode is not currently on, this is a no-op.
5675   * @param deltaSafe the change in number of safe blocks
5676   * @param deltaTotal the change i nnumber of total blocks expected
5677   */
5678  @Override
5679  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5680    // safeMode is volatile, and may be set to null at any time
5681    SafeModeInfo safeMode = this.safeMode;
5682    if (safeMode == null)
5683      return;
5684    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5685  }
5686
5687  /**
5688   * Set the total number of blocks in the system. 
5689   */
5690  public void setBlockTotal() {
5691    // safeMode is volatile, and may be set to null at any time
5692    SafeModeInfo safeMode = this.safeMode;
5693    if (safeMode == null)
5694      return;
5695    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5696  }
5697
5698  /**
5699   * Get the total number of blocks in the system. 
5700   */
5701  @Override // FSNamesystemMBean
5702  @Metric
5703  public long getBlocksTotal() {
5704    return blockManager.getTotalBlocks();
5705  }
5706
5707  /**
5708   * Get the total number of COMPLETE blocks in the system.
5709   * For safe mode only complete blocks are counted.
5710   */
5711  private long getCompleteBlocksTotal() {
5712    // Calculate number of blocks under construction
5713    long numUCBlocks = 0;
5714    readLock();
5715    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
5716    try {
5717      return getBlocksTotal() - numUCBlocks;
5718    } finally {
5719      readUnlock("getCompleteBlocksTotal");
5720    }
5721  }
5722
5723  /**
5724   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5725   * @throws IOException
5726   */
5727  void enterSafeMode(boolean resourcesLow) throws IOException {
5728    writeLock();
5729    try {
5730      // Stop the secret manager, since rolling the master key would
5731      // try to write to the edit log
5732      stopSecretManager();
5733
5734      // Ensure that any concurrent operations have been fully synced
5735      // before entering safe mode. This ensures that the FSImage
5736      // is entirely stable on disk as soon as we're in safe mode.
5737      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5738      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5739      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5740      if (isEditlogOpenForWrite) {
5741        getEditLog().logSyncAll();
5742      }
5743      if (!isInSafeMode()) {
5744        safeMode = new SafeModeInfo(resourcesLow);
5745        return;
5746      }
5747      if (resourcesLow) {
5748        safeMode.setResourcesLow();
5749      } else {
5750        safeMode.setManual();
5751      }
5752      if (isEditlogOpenForWrite) {
5753        getEditLog().logSyncAll();
5754      }
5755      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5756          + safeMode.getTurnOffTip());
5757    } finally {
5758      writeUnlock("enterSafeMode");
5759    }
5760  }
5761
5762  /**
5763   * Leave safe mode.
5764   */
5765  void leaveSafeMode() {
5766    writeLock();
5767    try {
5768      if (!isInSafeMode()) {
5769        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5770        return;
5771      }
5772      safeMode.leave();
5773    } finally {
5774      writeUnlock("leaveSafeMode");
5775    }
5776  }
5777    
5778  String getSafeModeTip() {
5779    // There is no need to take readLock.
5780    // Don't use isInSafeMode as this.safeMode might be set to null.
5781    // after isInSafeMode returns.
5782    boolean inSafeMode;
5783    SafeModeInfo safeMode = this.safeMode;
5784    if (safeMode == null) {
5785      inSafeMode = false;
5786    } else {
5787      inSafeMode = safeMode.isOn();
5788    }
5789
5790    if (!inSafeMode) {
5791      return "";
5792    } else {
5793      return safeMode.getTurnOffTip();
5794    }
5795  }
5796
  /**
   * Roll the edit log. Requires superuser privilege and is refused while
   * in safe mode.
   *
   * @return the checkpoint signature after rolling
   * @throws IOException if not permitted or rolling fails
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check after taking the lock: the HA state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock("rollEditLog");
    }
  }
5812
  /**
   * Start a checkpoint on behalf of a backup node. Refused in safe mode.
   *
   * @param backupNode the registration of the backup node
   * @param activeNamenode the registration of the active namenode
   * @return the command describing the checkpoint to perform
   * @throws IOException if the checkpoint cannot be started
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      // Re-check after taking the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      // Make the checkpoint start durable before replying.
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock("startCheckpoint");
    }
  }
5830
  /**
   * Process an incremental block report (received / deleted blocks) from
   * a datanode, under the namesystem write lock.
   *
   * @param nodeID the reporting datanode
   * @param srdb received / deleted blocks for one storage
   * @throws IOException if the report cannot be processed
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock("processIncrementalBlockReport");
    }
  }
5841  
  /**
   * Finish a checkpoint previously started by a backup node. Refused in
   * safe mode. Only the read lock is taken since the namespace itself is
   * not modified here.
   *
   * @param registration the registration of the checkpointing node
   * @param sig the signature of the checkpoint being completed
   * @throws IOException if the checkpoint cannot be ended
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      // Re-check after taking the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock("endCheckpoint");
    }
  }
5855
5856  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
5857    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
5858  }
5859
5860  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
5861      INode inode, int snapshotId)
5862      throws IOException {
5863    if (pc.isSuperUser()) {
5864      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
5865        if (XAttrHelper.getPrefixName(xattr).
5866            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
5867          throw new AccessControlException("Access is denied for " +
5868              pc.getUser() + " since the superuser is not allowed to " +
5869              "perform this operation.");
5870        }
5871      }
5872    }
5873  }
5874
5875  @Override
5876  public void checkSuperuserPrivilege()
5877      throws AccessControlException {
5878    if (isPermissionEnabled) {
5879      FSPermissionChecker pc = getPermissionChecker();
5880      pc.checkSuperuserPrivilege();
5881    }
5882  }
5883
5884  /**
5885   * Check to see if we have exceeded the limit on the number
5886   * of inodes.
5887   */
5888  void checkFsObjectLimit() throws IOException {
5889    if (maxFsObjects != 0 &&
5890        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5891      throw new IOException("Exceeded the configured number of objects " +
5892                             maxFsObjects + " in the filesystem.");
5893    }
5894  }
5895
5896  /**
5897   * Get the total number of objects in the system. 
5898   */
5899  @Override // FSNamesystemMBean
5900  public long getMaxObjects() {
5901    return maxFsObjects;
5902  }
5903
5904  @Override // FSNamesystemMBean
5905  @Metric
5906  public long getFilesTotal() {
5907    // There is no need to take fSNamesystem's lock as
5908    // FSDirectory has its own lock.
5909    return this.dir.totalInodes();
5910  }
5911
5912  @Override // FSNamesystemMBean
5913  @Metric
5914  public long getPendingReplicationBlocks() {
5915    return blockManager.getPendingReplicationBlocksCount();
5916  }
5917
5918  @Override // FSNamesystemMBean
5919  @Metric
5920  public long getUnderReplicatedBlocks() {
5921    return blockManager.getUnderReplicatedBlocksCount();
5922  }
5923
5924  /** Returns number of blocks with corrupt replicas */
5925  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
5926  public long getCorruptReplicaBlocks() {
5927    return blockManager.getCorruptReplicaBlocksCount();
5928  }
5929
5930  @Override // FSNamesystemMBean
5931  @Metric
5932  public long getScheduledReplicationBlocks() {
5933    return blockManager.getScheduledReplicationBlocksCount();
5934  }
5935
5936  @Override
5937  @Metric
5938  public long getPendingDeletionBlocks() {
5939    return blockManager.getPendingDeletionBlocksCount();
5940  }
5941
5942  @Override
5943  public long getBlockDeletionStartTime() {
5944    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
5945  }
5946
5947  @Metric
5948  public long getExcessBlocks() {
5949    return blockManager.getExcessBlocksCount();
5950  }
5951  
5952  // HA-only metric
5953  @Metric
5954  public long getPostponedMisreplicatedBlocks() {
5955    return blockManager.getPostponedMisreplicatedBlocksCount();
5956  }
5957
5958  // HA-only metric
5959  @Metric
5960  public int getPendingDataNodeMessageCount() {
5961    return blockManager.getPendingDataNodeMessageCount();
5962  }
5963  
5964  // HA-only metric
5965  @Metric
5966  public String getHAState() {
5967    return haContext.getState().toString();
5968  }
5969
5970  // HA-only metric
5971  @Metric
5972  public long getMillisSinceLastLoadedEdits() {
5973    if (isInStandbyState() && editLogTailer != null) {
5974      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
5975    } else {
5976      return 0;
5977    }
5978  }
5979  
5980  @Metric
5981  public int getBlockCapacity() {
5982    return blockManager.getCapacity();
5983  }
5984
5985  @Override // FSNamesystemMBean
5986  public String getFSState() {
5987    return isInSafeMode() ? "safeMode" : "Operational";
5988  }
5989  
  // JMX handle for the FSNamesystemState MBean (see registerMBean());
  // null when not registered, cleared again in shutdown().
  private ObjectName mbeanName;
  // JMX handle for an MXBean registered elsewhere in this class;
  // unregistered and cleared in shutdown().
  private ObjectName mxbeanName;
5992
5993  /**
5994   * Register the FSNamesystem MBean using the name
5995   *        "hadoop:service=NameNode,name=FSNamesystemState"
5996   */
5997  private void registerMBean() {
5998    // We can only implement one MXBean interface, so we keep the old one.
5999    try {
6000      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
6001      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
6002    } catch (NotCompliantMBeanException e) {
6003      throw new RuntimeException("Bad MBean setup", e);
6004    }
6005
6006    LOG.info("Registered FSNamesystemState MBean");
6007  }
6008
6009  /**
6010   * shutdown FSNamesystem
6011   */
6012  void shutdown() {
6013    if (snapshotManager != null) {
6014      snapshotManager.shutdown();
6015    }
6016    if (mbeanName != null) {
6017      MBeans.unregister(mbeanName);
6018      mbeanName = null;
6019    }
6020    if (mxbeanName != null) {
6021      MBeans.unregister(mxbeanName);
6022      mxbeanName = null;
6023    }
6024    if (dir != null) {
6025      dir.shutdown();
6026    }
6027    if (blockManager != null) {
6028      blockManager.shutdown();
6029    }
6030  }
6031
6032  @Override // FSNamesystemMBean
6033  public int getNumLiveDataNodes() {
6034    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
6035  }
6036
6037  @Override // FSNamesystemMBean
6038  public int getNumDeadDataNodes() {
6039    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
6040  }
6041  
6042  @Override // FSNamesystemMBean
6043  public int getNumDecomLiveDataNodes() {
6044    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6045    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6046    int liveDecommissioned = 0;
6047    for (DatanodeDescriptor node : live) {
6048      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6049    }
6050    return liveDecommissioned;
6051  }
6052
6053  @Override // FSNamesystemMBean
6054  public int getNumDecomDeadDataNodes() {
6055    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6056    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
6057    int deadDecommissioned = 0;
6058    for (DatanodeDescriptor node : dead) {
6059      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6060    }
6061    return deadDecommissioned;
6062  }
6063
6064  @Override // FSNamesystemMBean
6065  public int getVolumeFailuresTotal() {
6066    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6067    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6068    int volumeFailuresTotal = 0;
6069    for (DatanodeDescriptor node: live) {
6070      volumeFailuresTotal += node.getVolumeFailures();
6071    }
6072    return volumeFailuresTotal;
6073  }
6074
6075  @Override // FSNamesystemMBean
6076  public long getEstimatedCapacityLostTotal() {
6077    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6078    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6079    long estimatedCapacityLostTotal = 0;
6080    for (DatanodeDescriptor node: live) {
6081      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6082      if (volumeFailureSummary != null) {
6083        estimatedCapacityLostTotal +=
6084            volumeFailureSummary.getEstimatedCapacityLostTotal();
6085      }
6086    }
6087    return estimatedCapacityLostTotal;
6088  }
6089
6090  @Override // FSNamesystemMBean
6091  public int getNumDecommissioningDataNodes() {
6092    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
6093        .size();
6094  }
6095
6096  @Override // FSNamesystemMBean
6097  @Metric({"StaleDataNodes", 
6098    "Number of datanodes marked stale due to delayed heartbeat"})
6099  public int getNumStaleDataNodes() {
6100    return getBlockManager().getDatanodeManager().getNumStaleNodes();
6101  }
6102
6103  /**
6104   * Storages are marked as "content stale" after NN restart or fails over and
6105   * before NN receives the first Heartbeat followed by the first Blockreport.
6106   */
6107  @Override // FSNamesystemMBean
6108  public int getNumStaleStorages() {
6109    return getBlockManager().getDatanodeManager().getNumStaleStorages();
6110  }
6111
6112  @Override // FSNamesystemMBean
6113  public String getTopUserOpCounts() {
6114    if (!topConf.isEnabled) {
6115      return null;
6116    }
6117
6118    Date now = new Date();
6119    final List<RollingWindowManager.TopWindow> topWindows =
6120        topMetrics.getTopWindows();
6121    Map<String, Object> topMap = new TreeMap<String, Object>();
6122    topMap.put("windows", topWindows);
6123    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
6124    ObjectMapper mapper = new ObjectMapper();
6125    try {
6126      return mapper.writeValueAsString(topMap);
6127    } catch (IOException e) {
6128      LOG.warn("Failed to fetch TopUser metrics", e);
6129    }
6130    return null;
6131  }
6132
6133  /**
6134   * Increments, logs and then returns the stamp
6135   */
6136  long nextGenerationStamp(boolean legacyBlock)
6137      throws IOException, SafeModeException {
6138    assert hasWriteLock();
6139    checkNameNodeSafeMode("Cannot get next generation stamp");
6140
6141    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
6142    if (legacyBlock) {
6143      getEditLog().logGenerationStampV1(gs);
6144    } else {
6145      getEditLog().logGenerationStampV2(gs);
6146    }
6147
6148    // NB: callers sync the log
6149    return gs;
6150  }
6151
6152  /**
6153   * Increments, logs and then returns the block ID
6154   */
6155  private long nextBlockId() throws IOException {
6156    assert hasWriteLock();
6157    checkNameNodeSafeMode("Cannot get next block ID");
6158    final long blockId = blockIdManager.nextBlockId();
6159    getEditLog().logAllocateBlockId(blockId);
6160    // NB: callers sync the log
6161    return blockId;
6162  }
6163
  /**
   * Check whether a file has effectively been deleted: removed from the
   * inode map, orphaned by a recursive parent deletion, replaced by a new
   * inode of the same name, or marked deleted in its snapshot feature.
   *
   * @param file the file inode to check
   * @return true if the file should be treated as deleted
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        // Detached from the tree before reaching the root: deleted.
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        // Reached the root with an unbroken parent chain.
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // Still linked into the tree, but the current file may be deleted in
    // a snapshot-aware way.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6201
  /**
   * Validate that a block is under construction and that the given client
   * holds the lease on its file, returning the owning file inode.
   *
   * @param block the block being recovered or appended to
   * @param clientName name of the client claiming the lease
   * @return the INodeFile the block belongs to
   * @throws IOException if in safe mode, if the block does not exist or is
   *         not under construction, or if the file is deleted
   * @throws LeaseExpiredException if clientName does not hold the lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode validity
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6233  
6234  /**
6235   * Client is reporting some bad block locations.
6236   */
6237  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6238    checkOperation(OperationCategory.WRITE);
6239    writeLock();
6240    try {
6241      checkOperation(OperationCategory.WRITE);
6242      for (int i = 0; i < blocks.length; i++) {
6243        ExtendedBlock blk = blocks[i].getBlock();
6244        DatanodeInfo[] nodes = blocks[i].getLocations();
6245        String[] storageIDs = blocks[i].getStorageIDs();
6246        for (int j = 0; j < nodes.length; j++) {
6247          NameNode.stateChangeLog.info("*DIR* reportBadBlocks for block: {} on"
6248              + " datanode: {}", blk, nodes[j].getXferAddr());
6249          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6250              storageIDs == null ? null: storageIDs[j], 
6251              "client machine reported it");
6252        }
6253      }
6254    } finally {
6255      writeUnlock("reportBadBlocks");
6256    }
6257  }
6258
6259  /**
6260   * Get a new generation stamp together with an access token for 
6261   * a block under construction
6262   * 
6263   * This method is called for recovering a failed pipeline or setting up
6264   * a pipeline to append to a block.
6265   * 
6266   * @param block a block
6267   * @param clientName the name of a client
6268   * @return a located block with a new generation stamp and an access token
6269   * @throws IOException if any error occurs
6270   */
6271  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
6272      String clientName) throws IOException {
6273    LocatedBlock locatedBlock;
6274    checkOperation(OperationCategory.WRITE);
6275    writeLock();
6276    try {
6277      checkOperation(OperationCategory.WRITE);
6278
6279      // check vadility of parameters
6280      checkUCBlock(block, clientName);
6281  
6282      // get a new generation stamp and an access token
6283      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
6284      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
6285      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
6286    } finally {
6287      writeUnlock("bumpBlockGenerationStamp");
6288    }
6289    // Ensure we record the new generation stamp
6290    getEditLog().logSync();
6291    return locatedBlock;
6292  }
6293  
6294  /**
6295   * Update a pipeline for a block under construction
6296   * 
6297   * @param clientName the name of the client
6298   * @param oldBlock and old block
6299   * @param newBlock a new block with a new generation stamp and length
6300   * @param newNodes datanodes in the pipeline
6301   * @throws IOException if any error occurs
6302   */
6303  void updatePipeline(
6304      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
6305      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
6306      throws IOException {
6307    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
6308             + ", newGS=" + newBlock.getGenerationStamp()
6309             + ", newLength=" + newBlock.getNumBytes()
6310             + ", newNodes=" + Arrays.asList(newNodes)
6311             + ", client=" + clientName
6312             + ")");
6313    waitForLoadingFSImage();
6314    writeLock();
6315    try {
6316      checkOperation(OperationCategory.WRITE);
6317      checkNameNodeSafeMode("Pipeline not updated");
6318      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
6319        + oldBlock + " has different block identifier";
6320      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
6321          newStorageIDs, logRetryCache);
6322    } finally {
6323      writeUnlock("updatePipeline");
6324    }
6325    getEditLog().logSync();
6326    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
6327        + newBlock.getLocalBlock() + ") success");
6328  }
6329
  /**
   * Apply a pipeline update: bump the last block's generation stamp/length
   * and record the new expected locations. Caller holds the FSN write lock.
   *
   * @throws IOException if the new block is not strictly newer (generation
   *         stamp must increase, length must not shrink)
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final String src = pendingFile.getFullPathName();
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockManager.updateLastBlock(blockinfo, newBlock);

    // find the DatanodeDescriptor objects
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs,
            "src=%s, oldBlock=%s, newBlock=%s, clientName=%s",
            src, oldBlock, newBlock, clientName);
    blockinfo.setExpectedLocations(storages);

    persistBlocks(src, pendingFile, logRetryCache);
  }
6363
  // rename was successful. If any part of the renamed subtree had
  // files that were being written to, update the lease records with the
  // new filename. Caller must hold the FSN write lock.
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6370
6371  /**
6372   * Serializes leases.
6373   */
6374  void saveFilesUnderConstruction(DataOutputStream out,
6375      Map<Long, INodeFile> snapshotUCMap) throws IOException {
6376    // This is run by an inferior thread of saveNamespace, which holds a read
6377    // lock on our behalf. If we took the read lock here, we could block
6378    // for fairness if a writer is waiting on the lock.
6379    synchronized (leaseManager) {
6380      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
6381      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6382        // TODO: for HDFS-5428, because of rename operations, some
6383        // under-construction files that are
6384        // in the current fs directory can also be captured in the
6385        // snapshotUCMap. We should remove them from the snapshotUCMap.
6386        snapshotUCMap.remove(entry.getValue().getId());
6387      }
6388
6389      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
6390      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6391        FSImageSerialization.writeINodeUnderConstruction(
6392            out, entry.getValue(), entry.getKey());
6393      }
6394      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
6395        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
6396        // as their paths
6397        StringBuilder b = new StringBuilder();
6398        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
6399            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
6400            .append(Path.SEPARATOR).append(entry.getValue().getId());
6401        FSImageSerialization.writeINodeUnderConstruction(
6402            out, entry.getValue(), b.toString());
6403      }
6404    }
6405  }
6406
6407  /**
6408   * @return all the under-construction files in the lease map
6409   */
6410  Map<String, INodeFile> getFilesUnderConstruction() {
6411    synchronized (leaseManager) {
6412      return leaseManager.getINodesUnderConstruction();
6413    }
6414  }
6415
6416  /**
6417   * Register a Backup name-node, verifying that it belongs
6418   * to the correct namespace, and adding it to the set of
6419   * active journals if necessary.
6420   * 
6421   * @param bnReg registration of the new BackupNode
6422   * @param nnReg registration of this NameNode
6423   * @throws IOException if the namespace IDs do not match
6424   */
6425  void registerBackupNode(NamenodeRegistration bnReg,
6426      NamenodeRegistration nnReg) throws IOException {
6427    writeLock();
6428    try {
6429      if(getFSImage().getStorage().getNamespaceID() 
6430         != bnReg.getNamespaceID())
6431        throw new IOException("Incompatible namespaceIDs: "
6432            + " Namenode namespaceID = "
6433            + getFSImage().getStorage().getNamespaceID() + "; "
6434            + bnReg.getRole() +
6435            " node namespaceID = " + bnReg.getNamespaceID());
6436      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6437        getFSImage().getEditLog().registerBackupNode(
6438            bnReg, nnReg);
6439      }
6440    } finally {
6441      writeUnlock("registerBackupNode");
6442    }
6443  }
6444
6445  /**
6446   * Release (unregister) backup node.
6447   * <p>
6448   * Find and remove the backup stream corresponding to the node.
6449   * @throws IOException
6450   */
6451  void releaseBackupNode(NamenodeRegistration registration)
6452    throws IOException {
6453    checkOperation(OperationCategory.WRITE);
6454    writeLock();
6455    try {
6456      checkOperation(OperationCategory.WRITE);
6457      if(getFSImage().getStorage().getNamespaceID()
6458         != registration.getNamespaceID())
6459        throw new IOException("Incompatible namespaceIDs: "
6460            + " Namenode namespaceID = "
6461            + getFSImage().getStorage().getNamespaceID() + "; "
6462            + registration.getRole() +
6463            " node namespaceID = " + registration.getNamespaceID());
6464      getEditLog().releaseBackupStream(registration);
6465    } finally {
6466      writeUnlock("releaseBackupNode");
6467    }
6468  }
6469
6470  static class CorruptFileBlockInfo {
6471    final String path;
6472    final Block block;
6473    
6474    public CorruptFileBlockInfo(String p, Block b) {
6475      path = p;
6476      block = b;
6477    }
6478    
6479    @Override
6480    public String toString() {
6481      return block.getBlockName() + "\t" + path;
6482    }
6483  }
6484  /**
6485   * @param path Restrict corrupt files to this portion of namespace.
6486   * @param cookieTab Support for continuation; cookieTab  tells where
6487   *                  to start from
6488   * @return a list in which each entry describes a corrupt file/block
6489   * @throws IOException
6490   */
6491  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6492  String[] cookieTab) throws IOException {
6493    checkSuperuserPrivilege();
6494    checkOperation(OperationCategory.READ);
6495
6496    int count = 0;
6497    ArrayList<CorruptFileBlockInfo> corruptFiles =
6498        new ArrayList<CorruptFileBlockInfo>();
6499    if (cookieTab == null) {
6500      cookieTab = new String[] { null };
6501    }
6502
6503    // Do a quick check if there are any corrupt files without taking the lock
6504    if (blockManager.getMissingBlocksCount() == 0) {
6505      if (cookieTab[0] == null) {
6506        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
6507      }
6508      if (LOG.isDebugEnabled()) {
6509        LOG.debug("there are no corrupt file blocks.");
6510      }
6511      return corruptFiles;
6512    }
6513
6514    readLock();
6515    try {
6516      checkOperation(OperationCategory.READ);
6517      if (!isPopulatingReplQueues()) {
6518        throw new IOException("Cannot run listCorruptFileBlocks because " +
6519                              "replication queues have not been initialized.");
6520      }
6521      // print a limited # of corrupt files per call
6522
6523      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6524
6525      int skip = getIntCookie(cookieTab[0]);
6526      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6527        blkIterator.next();
6528      }
6529
6530      while (blkIterator.hasNext()) {
6531        Block blk = blkIterator.next();
6532        final INode inode = (INode)blockManager.getBlockCollection(blk);
6533        skip++;
6534        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6535          String src = inode.getFullPathName();
6536          if (src.startsWith(path)){
6537            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6538            count++;
6539            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6540              break;
6541          }
6542        }
6543      }
6544      cookieTab[0] = String.valueOf(skip);
6545      if (LOG.isDebugEnabled()) {
6546        LOG.debug("list corrupt file blocks returned: " + count);
6547      }
6548      return corruptFiles;
6549    } finally {
6550      readUnlock("listCorruptFileBlocks");
6551    }
6552  }
6553
6554  /**
6555   * Convert string cookie to integer.
6556   */
6557  private static int getIntCookie(String cookie){
6558    int c;
6559    if(cookie == null){
6560      c = 0;
6561    } else {
6562      try{
6563        c = Integer.parseInt(cookie);
6564      }catch (NumberFormatException e) {
6565        c = 0;
6566      }
6567    }
6568    c = Math.max(0, c);
6569    return c;
6570  }
6571
6572  /**
6573   * Create delegation token secret manager
6574   */
6575  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6576      Configuration conf) {
6577    return new DelegationTokenSecretManager(conf.getLong(
6578        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6579        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6580        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6581            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6582        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6583            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6584        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6585        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6586            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6587        this);
6588  }
6589
6590  /**
6591   * Returns the DelegationTokenSecretManager instance in the namesystem.
6592   * @return delegation token secret manager object
6593   */
6594  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6595    return dtSecretManager;
6596  }
6597
6598  /**
6599   * @param renewer Renewer information
6600   * @return delegation toek
6601   * @throws IOException on error
6602   */
6603  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6604      throws IOException {
6605    Token<DelegationTokenIdentifier> token;
6606    checkOperation(OperationCategory.WRITE);
6607    writeLock();
6608    try {
6609      checkOperation(OperationCategory.WRITE);
6610      checkNameNodeSafeMode("Cannot issue delegation token");
6611      if (!isAllowedDelegationTokenOp()) {
6612        throw new IOException(
6613          "Delegation Token can be issued only with kerberos or web authentication");
6614      }
6615      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6616        LOG.warn("trying to get DT with no secret manager running");
6617        return null;
6618      }
6619
6620      UserGroupInformation ugi = getRemoteUser();
6621      String user = ugi.getUserName();
6622      Text owner = new Text(user);
6623      Text realUser = null;
6624      if (ugi.getRealUser() != null) {
6625        realUser = new Text(ugi.getRealUser().getUserName());
6626      }
6627      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6628        renewer, realUser);
6629      token = new Token<DelegationTokenIdentifier>(
6630        dtId, dtSecretManager);
6631      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6632      getEditLog().logGetDelegationToken(dtId, expiryTime);
6633    } finally {
6634      writeUnlock("getDelegationToken");
6635    }
6636    getEditLog().logSync();
6637    return token;
6638  }
6639
6640  /**
6641   * 
6642   * @param token token to renew
6643   * @return new expiryTime of the token
6644   * @throws InvalidToken if {@code token} is invalid
6645   * @throws IOException on other errors
6646   */
6647  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6648      throws InvalidToken, IOException {
6649    long expiryTime;
6650    checkOperation(OperationCategory.WRITE);
6651    writeLock();
6652    try {
6653      checkOperation(OperationCategory.WRITE);
6654
6655      checkNameNodeSafeMode("Cannot renew delegation token");
6656      if (!isAllowedDelegationTokenOp()) {
6657        throw new IOException(
6658            "Delegation Token can be renewed only with kerberos or web authentication");
6659      }
6660      String renewer = getRemoteUser().getShortUserName();
6661      expiryTime = dtSecretManager.renewToken(token, renewer);
6662      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6663      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6664      DataInputStream in = new DataInputStream(buf);
6665      id.readFields(in);
6666      getEditLog().logRenewDelegationToken(id, expiryTime);
6667    } finally {
6668      writeUnlock("renewDelegationToken");
6669    }
6670    getEditLog().logSync();
6671    return expiryTime;
6672  }
6673
6674  /**
6675   * 
6676   * @param token token to cancel
6677   * @throws IOException on error
6678   */
6679  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6680      throws IOException {
6681    checkOperation(OperationCategory.WRITE);
6682    writeLock();
6683    try {
6684      checkOperation(OperationCategory.WRITE);
6685
6686      checkNameNodeSafeMode("Cannot cancel delegation token");
6687      String canceller = getRemoteUser().getUserName();
6688      DelegationTokenIdentifier id = dtSecretManager
6689        .cancelToken(token, canceller);
6690      getEditLog().logCancelDelegationToken(id);
6691    } finally {
6692      writeUnlock("cancelDelegationToken");
6693    }
6694    getEditLog().logSync();
6695  }
6696
6697  /**
6698   * @param out save state of the secret manager
6699   * @param sdPath String storage directory path
6700   */
6701  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
6702      throws IOException {
6703    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
6704  }
6705
  /** @return a snapshot of the secret manager state for fsimage saving. */
  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }
6709
6710  /**
6711   * @param in load the state of secret manager from input stream
6712   */
6713  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6714    dtSecretManager.loadSecretManagerStateCompat(in);
6715  }
6716
  /** Load secret manager state from a protobuf-format fsimage section. */
  void loadSecretManagerState(SecretManagerSection s,
      List<SecretManagerSection.DelegationKey> keys,
      List<SecretManagerSection.PersistToken> tokens) throws IOException {
    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
  }
6722
6723  /**
6724   * Log the updateMasterKey operation to edit logs
6725   * 
6726   * @param key new delegation key.
6727   */
6728  public void logUpdateMasterKey(DelegationKey key) {
6729    
6730    assert !isInSafeMode() :
6731      "this should never be called while in safemode, since we stop " +
6732      "the DT manager before entering safemode!";
6733    // No need to hold FSN lock since we don't access any internal
6734    // structures, and this is stopped before the FSN shuts itself
6735    // down, etc.
6736    getEditLog().logUpdateMasterKey(key);
6737    getEditLog().logSync();
6738  }
6739  
6740  /**
6741   * Log the cancellation of expired tokens to edit logs
6742   * 
6743   * @param id token identifier to cancel
6744   */
6745  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6746    assert !isInSafeMode() :
6747      "this should never be called while in safemode, since we stop " +
6748      "the DT manager before entering safemode!";
6749    // No need to hold FSN lock since we don't access any internal
6750    // structures, and this is stopped before the FSN shuts itself
6751    // down, etc.
6752    getEditLog().logCancelDelegationToken(id);
6753  }  
6754  
  /** Record a lease reassignment in the edit log; caller holds the write lock. */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6760  
6761  /**
6762   * 
6763   * @return true if delegation token operation is allowed
6764   */
6765  private boolean isAllowedDelegationTokenOp() throws IOException {
6766    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6767    if (UserGroupInformation.isSecurityEnabled()
6768        && (authMethod != AuthenticationMethod.KERBEROS)
6769        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6770        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6771      return false;
6772    }
6773    return true;
6774  }
6775  
6776  /**
6777   * Returns authentication method used to establish the connection
6778   * @return AuthenticationMethod used to establish connection
6779   * @throws IOException
6780   */
6781  private AuthenticationMethod getConnectionAuthenticationMethod()
6782      throws IOException {
6783    UserGroupInformation ugi = getRemoteUser();
6784    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6785    if (authMethod == AuthenticationMethod.PROXY) {
6786      authMethod = ugi.getRealUser().getAuthenticationMethod();
6787    }
6788    return authMethod;
6789  }
6790  
6791  /**
6792   * Client invoked methods are invoked over RPC and will be in 
6793   * RPC call context even if the client exits.
6794   */
6795  boolean isExternalInvocation() {
6796    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6797  }
6798
6799  private static InetAddress getRemoteIp() {
6800    InetAddress ip = Server.getRemoteIp();
6801    if (ip != null) {
6802      return ip;
6803    }
6804    return NamenodeWebHdfsMethods.getRemoteIp();
6805  }
6806  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6812  
6813  /**
6814   * Log fsck event in the audit log 
6815   */
6816  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6817    if (isAuditEnabled()) {
6818      logAuditEvent(true, getRemoteUser(),
6819                    remoteAddress,
6820                    "fsck", src, null, null);
6821    }
6822  }
6823  /**
6824   * Register NameNodeMXBean
6825   */
6826  private void registerMXBean() {
6827    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6828  }
6829
6830  /**
6831   * Class representing Namenode information for JMX interfaces
6832   */
6833  @Override // NameNodeMXBean
6834  public String getVersion() {
6835    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6836  }
6837
  /** @return DFS capacity used in bytes; delegates to getCapacityUsed(). */
  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }
6842
  /** @return DFS capacity remaining in bytes; delegates to getCapacityRemaining(). */
  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }
6847
  /** @return total DFS capacity in bytes; delegates to getCapacityTotal(). */
  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }
6852
6853  @Override // NameNodeMXBean
6854  public String getSafemode() {
6855    if (!this.isInSafeMode())
6856      return "";
6857    return "Safe mode is ON. " + this.getSafeModeTip();
6858  }
6859
  /** @return whether the last layout upgrade has been finalized. */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }
6864
  /** @return non-DFS used space in bytes, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
6869
  /** @return percentage of capacity used, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }
6874
  /** @return space used by this block pool in bytes, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }
6879
  /** @return percentage of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }
6884
  /** @return percentage of capacity remaining, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }
6889
  /** @return total cache capacity in bytes, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }
6894
  /** @return cache used in bytes, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }
6899
  /** @return total number of blocks in the namespace. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }
6904
  /** @return total number of files and directories; also exported as a metric. */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }
6910
  /** @return number of blocks with no live replicas. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
6915  
  /** @return number of missing blocks whose files have replication factor 1. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
    return getMissingReplOneBlocksCount();
  }
6920
  /** @return current JVM thread count, from the platform thread MXBean. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6925
6926  /**
6927   * Returned information is a JSON representation of map with host name as the
6928   * key and value is a map of live node attribute keys to its values
6929   */
6930  @Override // NameNodeMXBean
6931  public String getLiveNodes() {
6932    final Map<String, Map<String,Object>> info = 
6933      new HashMap<String, Map<String,Object>>();
6934    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6935    blockManager.getDatanodeManager().fetchDatanodes(live, null, false);
6936    for (DatanodeDescriptor node : live) {
6937      ImmutableMap.Builder<String, Object> innerinfo =
6938          ImmutableMap.<String,Object>builder();
6939      innerinfo
6940          .put("infoAddr", node.getInfoAddr())
6941          .put("infoSecureAddr", node.getInfoSecureAddr())
6942          .put("xferaddr", node.getXferAddr())
6943          .put("lastContact", getLastContact(node))
6944          .put("usedSpace", getDfsUsed(node))
6945          .put("adminState", node.getAdminState().toString())
6946          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6947          .put("capacity", node.getCapacity())
6948          .put("numBlocks", node.numBlocks())
6949          .put("version", node.getSoftwareVersion())
6950          .put("used", node.getDfsUsed())
6951          .put("remaining", node.getRemaining())
6952          .put("blockScheduled", node.getBlocksScheduled())
6953          .put("blockPoolUsed", node.getBlockPoolUsed())
6954          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6955          .put("volfails", node.getVolumeFailures());
6956      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6957      if (volumeFailureSummary != null) {
6958        innerinfo
6959            .put("failedStorageLocations",
6960                volumeFailureSummary.getFailedStorageLocations())
6961            .put("lastVolumeFailureDate",
6962                volumeFailureSummary.getLastVolumeFailureDate())
6963            .put("estimatedCapacityLostTotal",
6964                volumeFailureSummary.getEstimatedCapacityLostTotal());
6965      }
6966      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
6967    }
6968    return JSON.toString(info);
6969  }
6970
6971  /**
6972   * Returned information is a JSON representation of map with host name as the
6973   * key and value is a map of dead node attribute keys to its values
6974   */
6975  @Override // NameNodeMXBean
6976  public String getDeadNodes() {
6977    final Map<String, Map<String, Object>> info = 
6978      new HashMap<String, Map<String, Object>>();
6979    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6980    blockManager.getDatanodeManager().fetchDatanodes(null, dead, false);
6981    for (DatanodeDescriptor node : dead) {
6982      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6983          .put("lastContact", getLastContact(node))
6984          .put("decommissioned", node.isDecommissioned())
6985          .put("xferaddr", node.getXferAddr())
6986          .build();
6987      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
6988    }
6989    return JSON.toString(info);
6990  }
6991
6992  /**
6993   * Returned information is a JSON representation of map with host name as the
6994   * key and value is a map of decommissioning node attribute keys to its
6995   * values
6996   */
6997  @Override // NameNodeMXBean
6998  public String getDecomNodes() {
6999    final Map<String, Map<String, Object>> info = 
7000      new HashMap<String, Map<String, Object>>();
7001    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
7002        ).getDecommissioningNodes();
7003    for (DatanodeDescriptor node : decomNodeList) {
7004      Map<String, Object> innerinfo = ImmutableMap
7005          .<String, Object> builder()
7006          .put("xferaddr", node.getXferAddr())
7007          .put("underReplicatedBlocks",
7008              node.decommissioningStatus.getUnderReplicatedBlocks())
7009          .put("decommissionOnlyReplicas",
7010              node.decommissioningStatus.getDecommissionOnlyReplicas())
7011          .put("underReplicateInOpenFiles",
7012              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
7013          .build();
7014      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7015    }
7016    return JSON.toString(info);
7017  }
7018
  /** @return seconds elapsed since the datanode's last heartbeat. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }
7022
  /** @return DFS space used by the given datanode, in bytes. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
7026
  /** @return the cluster ID from the storage metadata. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }
7031  
  /** @return this namesystem's block pool ID. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
7036  
7037  @Override  // NameNodeMXBean
7038  public String getNameDirStatuses() {
7039    Map<String, Map<File, StorageDirType>> statusMap =
7040      new HashMap<String, Map<File, StorageDirType>>();
7041    
7042    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7043    for (Iterator<StorageDirectory> it
7044        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7045      StorageDirectory st = it.next();
7046      activeDirs.put(st.getRoot(), st.getStorageDirType());
7047    }
7048    statusMap.put("active", activeDirs);
7049    
7050    List<Storage.StorageDirectory> removedStorageDirs
7051        = getFSImage().getStorage().getRemovedStorageDirs();
7052    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7053    for (StorageDirectory st : removedStorageDirs) {
7054      failedDirs.put(st.getRoot(), st.getStorageDirType());
7055    }
7056    statusMap.put("failed", failedDirs);
7057    
7058    return JSON.toString(statusMap);
7059  }
7060
7061  @Override // NameNodeMXBean
7062  public String getNodeUsage() {
7063    float median = 0;
7064    float max = 0;
7065    float min = 0;
7066    float dev = 0;
7067
7068    final Map<String, Map<String,Object>> info =
7069        new HashMap<String, Map<String,Object>>();
7070    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
7071    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
7072
7073    if (live.size() > 0) {
7074      float totalDfsUsed = 0;
7075      float[] usages = new float[live.size()];
7076      int i = 0;
7077      for (DatanodeDescriptor dn : live) {
7078        usages[i++] = dn.getDfsUsedPercent();
7079        totalDfsUsed += dn.getDfsUsedPercent();
7080      }
7081      totalDfsUsed /= live.size();
7082      Arrays.sort(usages);
7083      median = usages[usages.length / 2];
7084      max = usages[usages.length - 1];
7085      min = usages[0];
7086
7087      for (i = 0; i < usages.length; i++) {
7088        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
7089      }
7090      dev = (float) Math.sqrt(dev / usages.length);
7091    }
7092
7093    final Map<String, Object> innerInfo = new HashMap<String, Object>();
7094    innerInfo.put("min", StringUtils.format("%.2f%%", min));
7095    innerInfo.put("median", StringUtils.format("%.2f%%", median));
7096    innerInfo.put("max", StringUtils.format("%.2f%%", max));
7097    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
7098    info.put("nodeUsage", innerInfo);
7099
7100    return JSON.toString(info);
7101  }
7102
  /**
   * @return JSON list describing each configured journal: whether it is
   *         required/disabled, its manager, and its current stream state
   */
  @Override  // NameNodeMXBean
  public String getNameJournalStatus() {
    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
    FSEditLog log = getFSImage().getEditLog();
    if (log != null) {
      // This flag can be false because we cannot hold a lock of FSEditLog
      // for metrics.
      boolean openForWrite = log.isOpenForWriteWithoutLock();
      for (JournalAndStream jas : log.getJournals()) {
        final Map<String, String> jasMap = new HashMap<String, String>();
        String manager = jas.getManager().toString();

        jasMap.put("required", String.valueOf(jas.isRequired()));
        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
        jasMap.put("manager", manager);

        if (jas.isDisabled()) {
          jasMap.put("stream", "Failed");
        } else if (openForWrite) {
          EditLogOutputStream elos = jas.getCurrentStream();
          if (elos != null) {
            jasMap.put("stream", elos.generateReport());
          } else {
            jasMap.put("stream", "not currently writing");
          }
        } else {
          jasMap.put("stream", "open for read");
        }
        jasList.add(jasMap);
      }
    }
    return JSON.toString(jasList);
  }
7136
7137  @Override // NameNodeMxBean
7138  public String getJournalTransactionInfo() {
7139    Map<String, String> txnIdMap = new HashMap<String, String>();
7140    txnIdMap.put("LastAppliedOrWrittenTxId",
7141        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7142    txnIdMap.put("MostRecentCheckpointTxId",
7143        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7144    return JSON.toString(txnIdMap);
7145  }
7146  
  /**
   * Human-readable NameNode start time as exposed over JMX.
   * The format is whatever {@code getStartTime().toString()} produces.
   */
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    return getStartTime().toString();
  }
7151
7152  @Override  // NameNodeMXBean
7153  public String getCompileInfo() {
7154    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
7155        " from " + VersionInfo.getBranch();
7156  }
7157
  /** @return the block manager owned by this namesystem. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
7162
  /** @return the block id manager owned by this namesystem. */
  public BlockIdManager getBlockIdManager() {
    return blockIdManager;
  }
7166
  /** @return the FSDirectory (the in-memory namespace tree). */
  @Override
  public FSDirectory getFSDirectory() {
    return dir;
  }
  /** Set the FSDirectory. For test use only. */
  @VisibleForTesting
  public void setFSDirectory(FSDirectory dir) {
    this.dir = dir;
  }
  /** @return the cache manager owned by this namesystem. */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
7181
7182  @Override  // NameNodeMXBean
7183  public String getCorruptFiles() {
7184    List<String> list = new ArrayList<String>();
7185    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7186    try {
7187      corruptFileBlocks = listCorruptFileBlocks("/", null);
7188      int corruptFileCount = corruptFileBlocks.size();
7189      if (corruptFileCount != 0) {
7190        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7191          list.add(c.toString());
7192        }
7193      }
7194    } catch (IOException e) {
7195      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7196    }
7197    return JSON.toString(list);
7198  }
7199
7200  @Override  //NameNodeMXBean
7201  public int getDistinctVersionCount() {
7202    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
7203      .size();
7204  }
7205
  /** @return map of datanode software version to number of nodes running it. */
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }
7210
  /** @return the software version of this NameNode. */
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    return VersionInfo.getVersion();
  }
7215
7216  /**
7217   * Verifies that the given identifier and password are valid and match.
7218   * @param identifier Token identifier.
7219   * @param password Password in the token.
7220   */
7221  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7222      byte[] password) throws InvalidToken, RetriableException {
7223    try {
7224      getDelegationTokenSecretManager().verifyToken(identifier, password);
7225    } catch (InvalidToken it) {
7226      if (inTransitionToActive()) {
7227        throw new RetriableException(it);
7228      }
7229      throw it;
7230    }
7231  }
7232  
  /** @return true if the block's generation stamp is ahead of the current one. */
  @Override
  public boolean isGenStampInFuture(Block block) {
    return blockIdManager.isGenStampInFuture(block);
  }
7237
  /** @return the edit log tailer (standby-side). For test use only. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
7242  
  /** Replace the edit log tailer. For test use only. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
7247  
  /** Replace the coarse namesystem read/write lock. For test use only. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
7252  
  /** @return the coarse namesystem read/write lock. For test use only. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
7257  
  /** @return the checkpoint lock. For test use only. */
  @VisibleForTesting
  public ReentrantLock getCpLockForTests() {
    return cpLock;
  }
7262
  /** @return the current safe mode state object. For test use only. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
7267  
  /** Replace the NameNode resource checker. For test use only. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
7272
  /** @return the snapshot manager owned by this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
7276  
  /**
   * Allow snapshot on a directory. Superuser only.
   * @param path directory to make snapshottable
   * @throws IOException if in safe mode, not superuser, or the op fails
   */
  void allowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final String operationName = "allowSnapshot";
    boolean success = false;
    writeLock();
    try {
      // Re-check after acquiring the lock; HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7295  
  /**
   * Disallow snapshot on a directory. Superuser only.
   * @param path directory to make non-snapshottable
   * @throws IOException if in safe mode, not superuser, or the op fails
   */
  void disallowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final String operationName = "disallowSnapshot";
    boolean success = false;
    writeLock();
    try {
      // Re-check after acquiring the lock; HA state may have changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7314  
7315  /**
7316   * Create a snapshot
7317   * @param snapshotRoot The directory path where the snapshot is taken
7318   * @param snapshotName The name of the snapshot
7319   */
7320  String createSnapshot(String snapshotRoot, String snapshotName,
7321                        boolean logRetryCache) throws IOException {
7322    final String operationName = "createSnapshot";
7323    String snapshotPath = null;
7324    writeLock();
7325    try {
7326      checkOperation(OperationCategory.WRITE);
7327      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
7328      snapshotPath = FSDirSnapshotOp.createSnapshot(dir,
7329          snapshotManager, snapshotRoot, snapshotName, logRetryCache);
7330    } finally {
7331      writeUnlock(operationName);
7332    }
7333    getEditLog().logSync();
7334    logAuditEvent(snapshotPath != null, operationName, snapshotRoot,
7335        snapshotPath, null);
7336    return snapshotPath;
7337  }
7338  
7339  /**
7340   * Rename a snapshot
7341   * @param path The directory path where the snapshot was taken
7342   * @param snapshotOldName Old snapshot name
7343   * @param snapshotNewName New snapshot name
7344   * @throws SafeModeException
7345   * @throws IOException 
7346   */
7347  void renameSnapshot(
7348      String path, String snapshotOldName, String snapshotNewName,
7349      boolean logRetryCache) throws IOException {
7350    final String operationName = "renameSnapshot";
7351    boolean success = false;
7352    writeLock();
7353    try {
7354      checkOperation(OperationCategory.WRITE);
7355      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7356      FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
7357          snapshotOldName, snapshotNewName, logRetryCache);
7358      success = true;
7359    } finally {
7360      writeUnlock(operationName);
7361    }
7362    getEditLog().logSync();
7363    String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7364    String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7365    logAuditEvent(success, operationName, oldSnapshotRoot,
7366        newSnapshotRoot, null);
7367  }
7368  
7369  /**
7370   * Get the list of snapshottable directories that are owned 
7371   * by the current user. Return all the snapshottable directories if the 
7372   * current user is a super user.
7373   * @return The list of all the current snapshottable directories
7374   * @throws IOException
7375   */
7376  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7377      throws IOException {
7378    final String operationName = "listSnapshottableDirectory";
7379    SnapshottableDirectoryStatus[] status = null;
7380    checkOperation(OperationCategory.READ);
7381    boolean success = false;
7382    readLock();
7383    try {
7384      checkOperation(OperationCategory.READ);
7385      status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
7386      success = true;
7387    } finally {
7388      readUnlock(operationName);
7389    }
7390    logAuditEvent(success, operationName, null, null, null);
7391    return status;
7392  }
7393  
7394  /**
7395   * Get the difference between two snapshots (or between a snapshot and the
7396   * current status) of a snapshottable directory.
7397   * 
7398   * @param path The full path of the snapshottable directory.
7399   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7400   *          or empty string indicates the current tree.
7401   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7402   *          empty string indicates the current tree.
7403   * @return A report about the difference between {@code fromSnapshot} and 
7404   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7405   *         directories belonging to the snapshottable directories are listed 
7406   *         and labeled as M/-/+/R respectively. 
7407   * @throws IOException
7408   */
7409  SnapshotDiffReport getSnapshotDiffReport(String path,
7410      String fromSnapshot, String toSnapshot) throws IOException {
7411    final String operationName = "computeSnapshotDiff";
7412    SnapshotDiffReport diffs = null;
7413    checkOperation(OperationCategory.READ);
7414    readLock();
7415    try {
7416      checkOperation(OperationCategory.READ);
7417      diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
7418          path, fromSnapshot, toSnapshot);
7419    } finally {
7420      readUnlock(operationName);
7421    }
7422
7423    logAuditEvent(diffs != null, operationName, null, null, null);
7424    return diffs;
7425  }
7426  
7427  /**
7428   * Delete a snapshot of a snapshottable directory
7429   * @param snapshotRoot The snapshottable directory
7430   * @param snapshotName The name of the to-be-deleted snapshot
7431   * @throws SafeModeException
7432   * @throws IOException
7433   */
7434  void deleteSnapshot(String snapshotRoot, String snapshotName,
7435      boolean logRetryCache) throws IOException {
7436    final String operationName = "deleteSnapshot";
7437    boolean success = false;
7438    writeLock();
7439    BlocksMapUpdateInfo blocksToBeDeleted = null;
7440    try {
7441      checkOperation(OperationCategory.WRITE);
7442      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7443
7444      blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
7445          snapshotRoot, snapshotName, logRetryCache);
7446      success = true;
7447    } finally {
7448      writeUnlock(operationName);
7449    }
7450    getEditLog().logSync();
7451
7452    // Breaking the pattern as removing blocks have to happen outside of the
7453    // global lock
7454    if (blocksToBeDeleted != null) {
7455      removeBlocks(blocksToBeDeleted);
7456    }
7457
7458    String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7459    logAuditEvent(success, operationName, rootPath, null, null);
7460  }
7461
7462  /**
7463   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7464   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7465   */
7466  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
7467    if (snapshotManager != null) {
7468      snapshotManager.removeSnapshottable(toRemove);
7469    }
7470  }
7471
  /**
   * Query the current rolling upgrade status. Superuser only.
   * @return the current rolling upgrade info (with the rollback-image flag
   *         refreshed), or null if no rolling upgrade is in progress
   * @throws IOException on failure to check for the rollback image
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (!isRollingUpgrade()) {
        return null;
      }
      Preconditions.checkNotNull(rollingUpgradeInfo);
      // Refresh the flag: the rollback image may have been created since
      // the upgrade was started.
      boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
      rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      return rollingUpgradeInfo;
    } finally {
      readUnlock("queryRollingUpgrade");
    }
  }
7488
  /**
   * Start a rolling upgrade. Superuser only. Idempotent: if an upgrade is
   * already in progress, returns its info without restarting.
   * @return info describing the (possibly pre-existing) rolling upgrade
   * @throws IOException if preconditions fail (e.g. safe mode state)
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    final String operationName = "startRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock(operationName);
    }

    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7522
7523  /**
7524   * Update internal state to indicate that a rolling upgrade is in progress.
7525   * @param startTime rolling upgrade start time
7526   */
7527  void startRollingUpgradeInternal(long startTime)
7528      throws IOException {
7529    checkRollingUpgrade("start rolling upgrade");
7530    getFSImage().checkUpgrade();
7531    setRollingUpgradeInfo(false, startTime);
7532  }
7533
7534  /**
7535   * Update internal state to indicate that a rolling upgrade is in progress for
7536   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7537   * checkpoint for rollback the namesystem will quit the safemode automatically 
7538   */
7539  private void startRollingUpgradeInternalForNonHA(long startTime)
7540      throws IOException {
7541    Preconditions.checkState(!haEnabled);
7542    if (!isInSafeMode()) {
7543      throw new IOException("Safe mode should be turned ON "
7544          + "in order to create namespace image.");
7545    }
7546    checkRollingUpgrade("start rolling upgrade");
7547    getFSImage().checkUpgrade();
7548    // in non-HA setup, we do an extra checkpoint to generate a rollback image
7549    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7550    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7551
7552    // leave SafeMode automatically
7553    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7554    setRollingUpgradeInfo(true, startTime);
7555  }
7556
  /**
   * Record that a rolling upgrade is in progress (finalize time 0 = not
   * finalized).
   * @param createdRollbackImages whether a rollback FSImage already exists
   * @param startTime rolling upgrade start time
   */
  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
        createdRollbackImages, startTime, 0L);
  }
7561
7562  public void setCreatedRollbackImages(boolean created) {
7563    if (rollingUpgradeInfo != null) {
7564      rollingUpgradeInfo.setCreatedRollbackImages(created);
7565    }
7566  }
7567
  /** @return the current rolling upgrade info, or null if none was started. */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7571
  /** @return whether a rollback FSImage still needs to be created. */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7575
  /** Set whether a rollback FSImage still needs to be created. */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7579
  /**
   * JMX view of the rolling upgrade status.
   * @return a bean describing the rolling upgrade, or null if none is in
   *         progress
   */
  @Override  // NameNodeMXBean
  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
    if (!isRollingUpgrade()) {
      return null;
    }
    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
    // Fast path: flag already set, no lock needed.
    if (upgradeInfo.createdRollbackImages()) {
      return new RollingUpgradeInfo.Bean(upgradeInfo);
    }
    readLock();
    try {
      // check again after acquiring the read lock.
      upgradeInfo = getRollingUpgradeInfo();
      if (upgradeInfo == null) {
        return null;
      }
      if (!upgradeInfo.createdRollbackImages()) {
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
    } catch (IOException ioe) {
      // Best-effort: report the status without the refreshed flag.
      LOG.warn("Encountered exception setting Rollback Image", ioe);
    } finally {
      readUnlock("getRollingUpgradeStatus");
    }
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
7607
7608  /** Is rolling upgrade in progress? */
7609  public boolean isRollingUpgrade() {
7610    return rollingUpgradeInfo != null && !rollingUpgradeInfo.isFinalized();
7611  }
7612
  /**
   * Fail if a rolling upgrade is already in progress.
   * @param action description of the attempted action, used in the message
   * @throws RollingUpgradeException if a rolling upgrade is in progress
   */
  void checkRollingUpgrade(String action) throws RollingUpgradeException {
    if (isRollingUpgrade()) {
      throw new RollingUpgradeException("Failed to " + action
          + " since a rolling upgrade is already in progress."
          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
    }
  }
7620
  /**
   * Finalize the current rolling upgrade. Superuser only.
   * @return the finalized rolling upgrade info, or null if no rolling
   *         upgrade was in progress
   * @throws IOException if in safe mode or finalization fails
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    final String operationName = "finalizeRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (!isRollingUpgrade()) {
        return null;
      }
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(rollingUpgradeInfo.getFinalizeTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
      getFSImage().updateStorageVersion();
      // The rollback image becomes the regular image once finalized.
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock(operationName);
    }

    if (!haEnabled) {
      // Sync not needed for ha since the edit was rolled after logging.
      getEditLog().logSync();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7656
  /**
   * Mark the current rolling upgrade as finalized.
   * @param finalizeTime the finalization timestamp to record
   */
  void finalizeRollingUpgradeInternal(long finalizeTime) {
    // Set the finalize time
    rollingUpgradeInfo.finalize(finalizeTime);
  }
7661
  /**
   * Add a new cache directive.
   * @param directive the directive to add
   * @param flags cache flags; without FORCE, waits for a cache rescan first
   * @param logRetryCache whether to record this call in the retry cache
   * @return the id of the new directive, or 0 on failure
   * @throws IOException if in safe mode or the directive cannot be added
   */
  long addCacheDirective(CacheDirectiveInfo directive,
                         EnumSet<CacheFlag> flags, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCacheDirective";
    CacheDirectiveInfo effectiveDirective = null;
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
          directive, flags, logRetryCache);
    } finally {
      writeUnlock(operationName);
      // Sync and audit in the finally block so failures are audited too.
      boolean success = effectiveDirective != null;
      if (success) {
        getEditLog().logSync();
      }

      String effectiveDirectiveStr = effectiveDirective != null ?
          effectiveDirective.toString() : null;
      logAuditEvent(success, operationName, effectiveDirectiveStr,
          null, null);
    }
    return effectiveDirective != null ? effectiveDirective.getId() : 0;
  }
7693
7694  void modifyCacheDirective(CacheDirectiveInfo directive,
7695      EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException {
7696    final String operationName = "modifyCacheDirective";
7697    boolean success = false;
7698    if (!flags.contains(CacheFlag.FORCE)) {
7699      cacheManager.waitForRescanIfNeeded();
7700    }
7701    writeLock();
7702    try {
7703      checkOperation(OperationCategory.WRITE);
7704      if (isInSafeMode()) {
7705        throw new SafeModeException(
7706            "Cannot add cache directive", safeMode);
7707      }
7708      FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags,
7709          logRetryCache);
7710      success = true;
7711    } finally {
7712      writeUnlock(operationName);
7713      if (success) {
7714        getEditLog().logSync();
7715      }
7716      String idStr = "{id: " + directive.getId().toString() + "}";
7717      logAuditEvent(success, "modifyCacheDirective", idStr,
7718          directive.toString(), null);
7719    }
7720  }
7721
  /**
   * Remove a cache directive by id.
   * @param id the id of the directive to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCacheDirective(long id, boolean logRetryCache) throws IOException {
    final String operationName = "removeCacheDirective";
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      // Audit in the finally block so failures are audited too.
      String idStr = "{id: " + Long.toString(id) + "}";
      logAuditEvent(success, operationName, idStr, null,
          null);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
  }
7742
  /**
   * List cache directives, starting after the given id.
   * @param startId the id to start listing after
   * @param filter directive fields to filter on; must be non-null
   *        (its toString is used for the audit log)
   * @return a batch of matching cache directive entries
   * @throws IOException on failure
   */
  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
      long startId, CacheDirectiveInfo filter) throws IOException {
    final String operationName = "listCacheDirectives";
    checkOperation(OperationCategory.READ);
    BatchedListEntries<CacheDirectiveEntry> results;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId,
          filter);
      success = true;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, filter.toString(), null,
          null);
    }
    return results;
  }
7763
  /**
   * Add a new cache pool.
   * @param req the pool to create
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the pool cannot be added
   */
  void addCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCachePool";
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req,
          logRetryCache);
      poolInfoStr = info.toString();
      success = true;
    } finally {
      writeUnlock(operationName);
      // Audit in the finally block so failures are audited too.
      logAuditEvent(success, operationName, poolInfoStr, null, null);
    }
    
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
  }
7787
  /**
   * Modify an existing cache pool.
   * @param req carries the pool name plus the fields to change
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the modification fails
   */
  void modifyCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "modifyCachePool";
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      // Audit in the finally block so failures are audited too.
      String poolNameStr = "{poolName: " +
          (req == null ? null : req.getPoolName()) + "}";
      logAuditEvent(success, operationName, poolNameStr,
                    req == null ? null : req.toString(), null);
    }

    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
  }
7811
  /**
   * Remove a cache pool by name.
   * @param cachePoolName the name of the pool to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCachePool(String cachePoolName, boolean logRetryCache)
      throws IOException {
    final String operationName = "removeCachePool";
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
          logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      // Audit in the finally block so failures are audited too.
      String poolNameStr = "{poolName: " + cachePoolName + "}";
      logAuditEvent(success, operationName, poolNameStr, null, null);
    }
    
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
  }
7834
  /**
   * List cache pools, starting after the given key.
   * @param prevKey the pool name to start listing after
   * @return a batch of cache pool entries
   * @throws IOException on failure
   */
  BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    final String operationName = "listCachePools";
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
      success = true;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, null, null, null);
    }
    return results;
  }
7853
  /**
   * Modify ACL entries on a path.
   * @param src the path whose ACL is modified
   * @param aclSpec the ACL entries to apply
   * @throws IOException if in safe mode, access is denied, or the op fails
   */
  void modifyAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "modifyAclEntries";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7873
  /**
   * Remove the given ACL entries from a path.
   * @param src the path whose ACL entries are removed
   * @param aclSpec the ACL entries to remove
   * @throws IOException if in safe mode, access is denied, or the op fails
   */
  void removeAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "removeAclEntries";
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7893
  /**
   * Remove the default ACL entries from a path.
   * @param src the path whose default ACL entries are removed
   * @throws IOException if in safe mode, access is denied, or the op fails
   */
  void removeDefaultAcl(final String src) throws IOException {
    final String operationName = "removeDefaultAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
      auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7912
  /**
   * Remove the entire ACL from a path.
   * @param src the path whose ACL is removed
   * @throws IOException if in safe mode, access is denied, or the op fails
   */
  void removeAcl(final String src) throws IOException {
    final String operationName = "removeAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      auditStat = FSDirAclOp.removeAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7931
  /**
   * Fully replace the ACL on a path.
   * @param src the path whose ACL is set
   * @param aclSpec the complete new ACL
   * @throws IOException if in safe mode, access is denied, or the op fails
   */
  void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
    final String operationName = "setAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set ACL on " + src);
      auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial, then rethrow to the caller.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log outside the write lock to avoid holding it during I/O.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7950
  /**
   * Get the ACL of a path.
   * @param src the path whose ACL is queried
   * @return the ACL status of the path
   * @throws IOException on failure
   */
  AclStatus getAclStatus(String src) throws IOException {
    final String operationName = "getAclStatus";
    checkOperation(OperationCategory.READ);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
      success = true;
      return ret;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, src);
    }
  }
7966
7967  /**
7968   * Create an encryption zone on directory src using the specified key.
7969   *
7970   * @param src     the path of a directory which will be the root of the
7971   *                encryption zone. The directory must be empty.
7972   * @param keyName name of a key which must be present in the configured
7973   *                KeyProvider.
7974   * @throws AccessControlException  if the caller is not the superuser.
7975   * @throws UnresolvedLinkException if the path can't be resolved.
7976   * @throws SafeModeException       if the Namenode is in safe mode.
7977   */
7978  void createEncryptionZone(final String src, final String keyName,
7979                            boolean logRetryCache)
7980    throws IOException, UnresolvedLinkException,
7981      SafeModeException, AccessControlException {
7982    try {
7983      if (provider == null) {
7984        throw new IOException(
7985            "Can't create an encryption zone for " + src +
7986            " since no key provider is available.");
7987      }
7988      if (keyName == null || keyName.isEmpty()) {
7989        throw new IOException("Must specify a key name when creating an " +
7990            "encryption zone");
7991      }
7992      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
7993      if (metadata == null) {
7994        /*
7995         * It would be nice if we threw something more specific than
7996         * IOException when the key is not found, but the KeyProvider API
7997         * doesn't provide for that. If that API is ever changed to throw
7998         * something more specific (e.g. UnknownKeyException) then we can
7999         * update this to match it, or better yet, just rethrow the
8000         * KeyProvider's exception.
8001         */
8002        throw new IOException("Key " + keyName + " doesn't exist.");
8003      }
8004      // If the provider supports pool for EDEKs, this will fill in the pool
8005      generateEncryptedDataEncryptionKey(keyName);
8006      createEncryptionZoneInt(src, metadata.getCipher(),
8007          keyName, logRetryCache);
8008    } catch (AccessControlException e) {
8009      logAuditEvent(false, "createEncryptionZone", src);
8010      throw e;
8011    }
8012  }
8013
8014  private void createEncryptionZoneInt(final String srcArg, String cipher,
8015      String keyName, final boolean logRetryCache) throws IOException {
8016    final String operationName = "createEncryptionZone";
8017    String src = srcArg;
8018    HdfsFileStatus resultingStat = null;
8019    checkSuperuserPrivilege();
8020    FSPermissionChecker pc = getPermissionChecker();
8021    writeLock();
8022    try {
8023      checkSuperuserPrivilege();
8024      checkOperation(OperationCategory.WRITE);
8025      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
8026      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
8027      src = iip.getPath();
8028
8029      final CipherSuite suite = CipherSuite.convert(cipher);
8030      // For now this is hardcoded, as we only support one method.
8031      final CryptoProtocolVersion version =
8032          CryptoProtocolVersion.ENCRYPTION_ZONES;
8033      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
8034          version, keyName);
8035      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
8036      xAttrs.add(ezXAttr);
8037      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
8038      resultingStat = dir.getAuditFileInfo(iip);
8039    } finally {
8040      writeUnlock(operationName);
8041    }
8042    getEditLog().logSync();
8043    logAuditEvent(true, operationName, srcArg, null, resultingStat);
8044  }
8045
8046  /**
8047   * Get the encryption zone for the specified path.
8048   *
8049   * @param srcArg the path of a file or directory to get the EZ for.
8050   * @return the EZ of the of the path or null if none.
8051   * @throws AccessControlException  if the caller is not the superuser.
8052   * @throws UnresolvedLinkException if the path can't be resolved.
8053   */
8054  EncryptionZone getEZForPath(final String srcArg)
8055    throws AccessControlException, UnresolvedLinkException, IOException {
8056    String src = srcArg;
8057    final String operationName = "getEZForPath";
8058    HdfsFileStatus resultingStat = null;
8059    boolean success = false;
8060    final FSPermissionChecker pc = getPermissionChecker();
8061    checkOperation(OperationCategory.READ);
8062    readLock();
8063    try {
8064      checkOperation(OperationCategory.READ);
8065      INodesInPath iip = dir.resolvePath(pc, src);
8066      if (isPermissionEnabled) {
8067        dir.checkPathAccess(pc, iip, FsAction.READ);
8068      }
8069      final EncryptionZone ret = dir.getEZForPath(iip);
8070      resultingStat = dir.getAuditFileInfo(iip);
8071      success = true;
8072      return ret;
8073    } finally {
8074      readUnlock(operationName);
8075      logAuditEvent(success, operationName, srcArg, null, resultingStat);
8076    }
8077  }
8078
8079  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
8080      throws IOException {
8081    final String operationName = "listEncryptionZones";
8082    boolean success = false;
8083    checkSuperuserPrivilege();
8084    checkOperation(OperationCategory.READ);
8085    readLock();
8086    try {
8087      checkSuperuserPrivilege();
8088      checkOperation(OperationCategory.READ);
8089      final BatchedListEntries<EncryptionZone> ret =
8090          dir.listEncryptionZones(prevId);
8091      success = true;
8092      return ret;
8093    } finally {
8094      readUnlock(operationName);
8095      logAuditEvent(success, operationName, null);
8096    }
8097  }
8098
8099  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
8100                boolean logRetryCache)
8101      throws IOException {
8102    final String operationName = "setXAttr";
8103    HdfsFileStatus auditStat = null;
8104    writeLock();
8105    try {
8106      checkOperation(OperationCategory.WRITE);
8107      checkNameNodeSafeMode("Cannot set XAttr on " + src);
8108      auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
8109    } catch (AccessControlException e) {
8110      logAuditEvent(false, operationName, src);
8111      throw e;
8112    } finally {
8113      writeUnlock(operationName);
8114    }
8115    getEditLog().logSync();
8116    logAuditEvent(true, operationName, src, null, auditStat);
8117  }
8118
8119  List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
8120      throws IOException {
8121    final String operationName = "getXAttrs";
8122    checkOperation(OperationCategory.READ);
8123    readLock();
8124    try {
8125      checkOperation(OperationCategory.READ);
8126      return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
8127    } catch (AccessControlException e) {
8128      logAuditEvent(false, operationName, src);
8129      throw e;
8130    } finally {
8131      readUnlock(operationName);
8132    }
8133  }
8134
8135  List<XAttr> listXAttrs(String src) throws IOException {
8136    final String operationName = "listXAttrs";
8137    checkOperation(OperationCategory.READ);
8138    readLock();
8139    try {
8140      checkOperation(OperationCategory.READ);
8141      return FSDirXAttrOp.listXAttrs(dir, src);
8142    } catch (AccessControlException e) {
8143      logAuditEvent(false, operationName, src);
8144      throw e;
8145    } finally {
8146      readUnlock(operationName);
8147    }
8148  }
8149
8150  void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
8151      throws IOException {
8152    final String operationName = "removeXAttr";
8153    HdfsFileStatus auditStat = null;
8154    writeLock();
8155    try {
8156      checkOperation(OperationCategory.WRITE);
8157      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
8158      auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
8159    } catch (AccessControlException e) {
8160      logAuditEvent(false, operationName, src);
8161      throw e;
8162    } finally {
8163      writeUnlock(operationName);
8164    }
8165    getEditLog().logSync();
8166    logAuditEvent(true, operationName, src, null, auditStat);
8167  }
8168
8169  void checkAccess(String src, FsAction mode) throws IOException {
8170    final String operationName = "checkAccess";
8171    checkOperation(OperationCategory.READ);
8172    FSPermissionChecker pc = getPermissionChecker();
8173    readLock();
8174    try {
8175      checkOperation(OperationCategory.READ);
8176      final INodesInPath iip = dir.resolvePath(pc, src);
8177      src = iip.getPath();
8178      INode inode = iip.getLastINode();
8179      if (inode == null) {
8180        throw new FileNotFoundException("Path not found");
8181      }
8182      if (isPermissionEnabled) {
8183        dir.checkPathAccess(pc, iip, mode);
8184      }
8185    } catch (AccessControlException e) {
8186      logAuditEvent(false, operationName, src);
8187      throw e;
8188    } finally {
8189      readUnlock(operationName);
8190    }
8191  }
8192
8193  /**
8194   * Default AuditLogger implementation; used when no access logger is
8195   * defined in the config file. It can also be explicitly listed in the
8196   * config file.
8197   */
8198  private static class DefaultAuditLogger extends HdfsAuditLogger {
8199
8200    private boolean logTokenTrackingId;
8201
8202    @Override
8203    public void initialize(Configuration conf) {
8204      logTokenTrackingId = conf.getBoolean(
8205          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
8206          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
8207    }
8208
8209    @Override
8210    public void logAuditEvent(boolean succeeded, String userName,
8211        InetAddress addr, String cmd, String src, String dst,
8212        FileStatus status, UserGroupInformation ugi,
8213        DelegationTokenSecretManager dtSecretManager) {
8214      if (auditLog.isInfoEnabled()) {
8215        final StringBuilder sb = auditBuffer.get();
8216        sb.setLength(0);
8217        sb.append("allowed=").append(succeeded).append("\t");
8218        sb.append("ugi=").append(userName).append("\t");
8219        sb.append("ip=").append(addr).append("\t");
8220        sb.append("cmd=").append(cmd).append("\t");
8221        sb.append("src=").append(src).append("\t");
8222        sb.append("dst=").append(dst).append("\t");
8223        if (null == status) {
8224          sb.append("perm=null");
8225        } else {
8226          sb.append("perm=");
8227          sb.append(status.getOwner()).append(":");
8228          sb.append(status.getGroup()).append(":");
8229          sb.append(status.getPermission());
8230        }
8231        if (logTokenTrackingId) {
8232          sb.append("\t").append("trackingId=");
8233          String trackingId = null;
8234          if (ugi != null && dtSecretManager != null
8235              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
8236            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
8237              if (tid instanceof DelegationTokenIdentifier) {
8238                DelegationTokenIdentifier dtid =
8239                    (DelegationTokenIdentifier)tid;
8240                trackingId = dtSecretManager.getTokenTrackingId(dtid);
8241                break;
8242              }
8243            }
8244          }
8245          sb.append(trackingId);
8246        }
8247        sb.append("\t").append("proto=");
8248        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
8249        logAuditMessage(sb.toString());
8250      }
8251    }
8252
8253    public void logAuditMessage(String message) {
8254      auditLog.info(message);
8255    }
8256  }
8257
8258  private static void enableAsyncAuditLog() {
8259    if (!(auditLog instanceof Log4JLogger)) {
8260      LOG.warn("Log4j is required to enable async auditlog");
8261      return;
8262    }
8263    Logger logger = ((Log4JLogger)auditLog).getLogger();
8264    @SuppressWarnings("unchecked")
8265    List<Appender> appenders = Collections.list(logger.getAllAppenders());
8266    // failsafe against trying to async it more than once
8267    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
8268      AsyncAppender asyncAppender = new AsyncAppender();
8269      // change logger to have an async appender containing all the
8270      // previously configured appenders
8271      for (Appender appender : appenders) {
8272        logger.removeAppender(appender);
8273        asyncAppender.addAppender(appender);
8274      }
8275      logger.addAppender(asyncAppender);        
8276    }
8277  }
8278
8279}
8280