001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
093import static org.apache.hadoop.util.Time.now;
094import static org.apache.hadoop.util.Time.monotonicNow;
095import static org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics.TOPMETRICS_METRICS_SOURCE_NAME;
096
097import java.io.BufferedWriter;
098import java.io.ByteArrayInputStream;
099import java.io.DataInput;
100import java.io.DataInputStream;
101import java.io.DataOutputStream;
102import java.io.File;
103import java.io.FileNotFoundException;
104import java.io.FileOutputStream;
105import java.io.IOException;
106import java.io.OutputStreamWriter;
107import java.io.PrintWriter;
108import java.io.StringWriter;
109import java.lang.management.ManagementFactory;
110import java.net.InetAddress;
111import java.net.URI;
112import java.security.GeneralSecurityException;
113import java.util.ArrayList;
114import java.util.Arrays;
115import java.util.Collection;
116import java.util.Collections;
117import java.util.Date;
118import java.util.EnumSet;
119import java.util.HashMap;
120import java.util.HashSet;
121import java.util.Iterator;
122import java.util.LinkedHashSet;
123import java.util.List;
124import java.util.Map;
125import java.util.Set;
126import java.util.TreeMap;
127import java.util.concurrent.TimeUnit;
128import java.util.concurrent.locks.Condition;
129import java.util.concurrent.locks.ReentrantLock;
130import java.util.concurrent.locks.ReentrantReadWriteLock;
131
132import javax.management.NotCompliantMBeanException;
133import javax.management.ObjectName;
134import javax.management.StandardMBean;
135
136import org.apache.commons.logging.Log;
137import org.apache.commons.logging.LogFactory;
138import org.apache.commons.logging.impl.Log4JLogger;
139import org.apache.hadoop.HadoopIllegalArgumentException;
140import org.apache.hadoop.classification.InterfaceAudience;
141import org.apache.hadoop.conf.Configuration;
142import org.apache.hadoop.crypto.CipherSuite;
143import org.apache.hadoop.crypto.CryptoProtocolVersion;
144import org.apache.hadoop.crypto.key.KeyProvider;
145import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
146import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
147import org.apache.hadoop.fs.CacheFlag;
148import org.apache.hadoop.fs.ContentSummary;
149import org.apache.hadoop.fs.CreateFlag;
150import org.apache.hadoop.fs.FileAlreadyExistsException;
151import org.apache.hadoop.fs.FileEncryptionInfo;
152import org.apache.hadoop.fs.FileStatus;
153import org.apache.hadoop.fs.FileSystem;
154import org.apache.hadoop.fs.FsServerDefaults;
155import org.apache.hadoop.fs.InvalidPathException;
156import org.apache.hadoop.fs.Options;
157import org.apache.hadoop.fs.ParentNotDirectoryException;
158import org.apache.hadoop.fs.Path;
159import org.apache.hadoop.fs.UnresolvedLinkException;
160import org.apache.hadoop.fs.XAttr;
161import org.apache.hadoop.fs.XAttrSetFlag;
162import org.apache.hadoop.fs.permission.AclEntry;
163import org.apache.hadoop.fs.permission.AclStatus;
164import org.apache.hadoop.fs.permission.FsAction;
165import org.apache.hadoop.fs.permission.FsPermission;
166import org.apache.hadoop.fs.permission.PermissionStatus;
167import org.apache.hadoop.fs.StorageType;
168import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
169import org.apache.hadoop.ha.ServiceFailedException;
170import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
171import org.apache.hadoop.hdfs.DFSConfigKeys;
172import org.apache.hadoop.hdfs.DFSUtil;
173import org.apache.hadoop.hdfs.HAUtil;
174import org.apache.hadoop.hdfs.HdfsConfiguration;
175import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
176import org.apache.hadoop.hdfs.XAttrHelper;
177import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
178import org.apache.hadoop.hdfs.protocol.Block;
179import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
180import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
181import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
182import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
183import org.apache.hadoop.hdfs.protocol.ClientProtocol;
184import org.apache.hadoop.hdfs.protocol.DatanodeID;
185import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
186import org.apache.hadoop.hdfs.protocol.DirectoryListing;
187import org.apache.hadoop.hdfs.protocol.EncryptionZone;
188import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
189import org.apache.hadoop.hdfs.protocol.HdfsConstants;
190import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus;
191import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
192import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
193import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
194import org.apache.hadoop.hdfs.protocol.LocatedBlock;
195import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
196import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
197import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
198import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
199import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
200import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
201import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
202import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
203import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
205import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
208import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
209import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
210import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
213import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
221import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
222import org.apache.hadoop.hdfs.server.common.Storage;
223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
224import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
225import org.apache.hadoop.hdfs.server.common.Util;
226import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
227import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
228import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
229import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
230import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
231import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
232import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
233import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
234import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
235import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
236import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
237import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
238import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
239import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
241import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
243import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
244import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
245import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
246import org.apache.hadoop.hdfs.server.namenode.top.TopConf;
247import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
248import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager;
249import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
250import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
252import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
253import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
254import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
255import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
256import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
257import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
258import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
259import org.apache.hadoop.hdfs.server.protocol.StorageReport;
260import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
261import org.apache.hadoop.io.EnumSetWritable;
262import org.apache.hadoop.io.IOUtils;
263import org.apache.hadoop.io.Text;
264import org.apache.hadoop.ipc.RetriableException;
265import org.apache.hadoop.ipc.RetryCache;
266import org.apache.hadoop.ipc.Server;
267import org.apache.hadoop.ipc.StandbyException;
268import org.apache.hadoop.metrics2.annotation.Metric;
269import org.apache.hadoop.metrics2.annotation.Metrics;
270import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
271import org.apache.hadoop.metrics2.lib.MetricsRegistry;
272import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
273import org.apache.hadoop.metrics2.util.MBeans;
274import org.apache.hadoop.net.NetworkTopology;
275import org.apache.hadoop.net.Node;
276import org.apache.hadoop.net.NodeBase;
277import org.apache.hadoop.security.AccessControlException;
278import org.apache.hadoop.security.UserGroupInformation;
279import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
280import org.apache.hadoop.security.token.SecretManager.InvalidToken;
281import org.apache.hadoop.security.token.Token;
282import org.apache.hadoop.security.token.TokenIdentifier;
283import org.apache.hadoop.security.token.delegation.DelegationKey;
284import org.apache.hadoop.util.ChunkedArrayList;
285import org.apache.hadoop.util.Daemon;
286import org.apache.hadoop.util.DataChecksum;
287import org.apache.hadoop.util.ReflectionUtils;
288import org.apache.hadoop.util.StringUtils;
289import org.apache.hadoop.util.VersionInfo;
290import org.apache.log4j.Appender;
291import org.apache.log4j.AsyncAppender;
292import org.apache.log4j.Logger;
293import org.codehaus.jackson.map.ObjectMapper;
294import org.mortbay.util.ajax.JSON;
295
296import com.google.common.annotations.VisibleForTesting;
297import com.google.common.base.Charsets;
298import com.google.common.base.Preconditions;
299import com.google.common.collect.ImmutableMap;
300import com.google.common.collect.Lists;
301
302/***************************************************
303 * FSNamesystem does the actual bookkeeping work for the
304 * DataNode.
305 *
306 * It tracks several important tables.
307 *
308 * 1)  valid fsname --> blocklist  (kept on disk, logged)
309 * 2)  Set of all valid blocks (inverted #1)
310 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
311 * 4)  machine --> blocklist (inverted #2)
312 * 5)  LRU cache of updated-heartbeat machines
313 ***************************************************/
314@InterfaceAudience.Private
315@Metrics(context="dfs")
316public class FSNamesystem implements Namesystem, FSNamesystemMBean,
317  NameNodeMXBean {
  /** Class-wide logger for namesystem events. */
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
  /** Metrics registry backing the {@code @Metric}-annotated members. */
  private final MetricsRegistry registry = new MetricsRegistry("FSNamesystem");
  // Aggregated rates tracking how long the namesystem lock is held,
  // broken down per operation (see the metric name below).
  @Metric final MutableRatesWithAggregation detailedLockHoldTimeMetrics =
      registry.newRatesWithAggregation("detailedLockHoldTimeMetrics");
322
  // Per-thread StringBuilder — presumably reused when formatting audit log
  // entries to avoid repeated allocation; not referenced in this chunk, so
  // confirm usage at the call sites.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };

  // Manages block IDs and generation stamps; cleared in clear().
  private final BlockIdManager blockIdManager;
332
333  @VisibleForTesting
334  public boolean isAuditEnabled() {
335    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
336  }
337
  /**
   * Convenience overload: logs an audit event with no destination path and no
   * file status (delegates to the five-argument form).
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
342  
343  private void logAuditEvent(boolean succeeded, String cmd, String src,
344      String dst, HdfsFileStatus stat) throws IOException {
345    if (isAuditEnabled() && isExternalInvocation()) {
346      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
347                    cmd, src, dst, stat);
348    }
349  }
350
351  private void logAuditEvent(boolean succeeded,
352      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
353      String dst, HdfsFileStatus stat) {
354    FileStatus status = null;
355    if (stat != null) {
356      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
357      Path path = dst != null ? new Path(dst) : new Path(src);
358      status = new FileStatus(stat.getLen(), stat.isDir(),
359          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
360          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
361          stat.getGroup(), symlink, path);
362    }
363    for (AuditLogger logger : auditLoggers) {
364      if (logger instanceof HdfsAuditLogger) {
365        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
366        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
367            status, ugi, dtSecretManager);
368      } else {
369        logger.logAuditEvent(succeeded, ugi.toString(), addr,
370            cmd, src, dst, status);
371      }
372    }
373  }
374
375  /**
376   * Logger for audit events, noting successful FSNamesystem operations. Emits
377   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
378   * <code>key=value</code> pairs to be written for the following properties:
379   * <code>
380   * ugi=&lt;ugi in RPC&gt;
381   * ip=&lt;remote IP&gt;
382   * cmd=&lt;command&gt;
383   * src=&lt;src path&gt;
384   * dst=&lt;dst path (optional)&gt;
385   * perm=&lt;permissions (optional)&gt;
386   * </code>
387   */
388  public static final Log auditLog = LogFactory.getLog(
389      FSNamesystem.class.getName() + ".audit");
390
391  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
392  static int BLOCK_DELETION_INCREMENT = 1000;
393  private final boolean isPermissionEnabled;
394  private final UserGroupInformation fsOwner;
395  private final String supergroup;
396  private final boolean standbyShouldCheckpoint;
397  
398  // Scan interval is not configurable.
399  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
400    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
401  final DelegationTokenSecretManager dtSecretManager;
402  private final boolean alwaysUseDelegationTokensForTests;
403
404  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
405    new Step(StepType.AWAITING_REPORTED_BLOCKS);
406
407  // Tracks whether the default audit logger is the only configured audit
408  // logger; this allows isAuditEnabled() to return false in case the
409  // underlying logger is disabled, and avoid some unnecessary work.
410  private final boolean isDefaultAuditLogger;
411  private final List<AuditLogger> auditLoggers;
412
  /** The namespace tree. */
  FSDirectory dir;
  // Core subsystem managers: block/replica state, snapshots, cache
  // directives, and aggregated per-datanode statistics.
  private final BlockManager blockManager;
  private final SnapshotManager snapshotManager;
  private final CacheManager cacheManager;
  private final DatanodeStatistics datanodeStatistics;

  // Nameservice this NN serves — presumably set from config; confirm in ctor.
  private String nameserviceId;

  // NOTE(review): presumably non-null only while a rolling upgrade is in
  // progress; confirm against the rolling-upgrade code paths.
  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
  /**
   * A flag that indicates whether the checkpointer should checkpoint a rollback
   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
   * rollback fsimage if the flag is true, and then change the flag to false.
   */
  private volatile boolean needRollbackFsImage;

  // Block pool ID used by this namenode
  private String blockPoolId;

  final LeaseManager leaseManager = new LeaseManager(this); 

  volatile Daemon smmthread = null;  // SafeModeMonitor thread
  
  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread

  // A daemon to periodically clean up corrupt lazyPersist files
  // from the name space.
  Daemon lazyPersistFileScrubber = null;
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread 
   */
  private final int editLogRollerInterval;

  /**
   * How frequently we scan and unlink corrupt lazyPersist files.
   * (In seconds)
   */
  private final int lazyPersistFileScrubIntervalSec;

  // Presumably updated by the NameNodeResourceMonitor daemon above — confirm.
  private volatile boolean hasResourcesAvailable = false;
  // Cleared on shutdown — TODO confirm where this is flipped to false.
  private volatile boolean fsRunning = true;
  
  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;
467
  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  // Server-side defaults (block size, replication, etc.) handed to clients.
  private final FsServerDefaults serverDefaults;
  // Whether append is supported — presumably dfs.support.append; confirm.
  private final boolean supportAppends;
  // Policy for replacing a datanode on write-pipeline failure.
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;          // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. */
  private final FSNamesystemLock fsLock;

  /** 
   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
   * does not provide proper protection, because there are operations that
   * modify both block and name system state.  Even on standby, fsLock is 
   * used when block state changes need to be blocked.
   */
  private final ReentrantLock cpLock;

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called. 
   */
  private HAContext haContext;

  // True when this namenode is part of an HA deployment.
  private final boolean haEnabled;

  /** flag indicating whether replication queues have been initialized */
  boolean initializedReplQueues = false;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  // Caches RPC responses so retried client calls can be answered without
  // re-executing — NOTE(review): confirm semantics against RetryCache.
  private final RetryCache retryCache;

  // Key provider for encryption at rest; presumably null when no provider
  // is configured — confirm in the constructor.
  private KeyProviderCryptoExtension provider = null;

  // imageLoaded/cond pair: cond is signalled in setImageLoaded() and awaited
  // in waitForLoadingFSImage() under the namesystem write lock.
  private volatile boolean imageLoaded = false;
  private final Condition cond;

  // The fsimage/edit-log storage this namesystem was loaded from.
  private final FSImage fsImage;

  // Configuration and metrics for the "top users" (nntop) feature.
  private final TopConf topConf;
  private TopMetrics topMetrics;

  // Optional plugin that can override INode attributes; may be null.
  private INodeAttributeProvider inodeAttributeProvider;
536
537  /**
538   * Notify that loading of this FSDirectory is complete, and
539   * it is imageLoaded for use
540   */
541  void imageLoadComplete() {
542    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
543    setImageLoaded();
544  }
545
546  void setImageLoaded() {
547    if(imageLoaded) return;
548    writeLock();
549    try {
550      setImageLoaded(true);
551      dir.markNameCacheInitialized();
552      cond.signalAll();
553    } finally {
554      writeUnlock("setImageLoaded");
555    }
556  }
557
  // This is for testing purposes only.
  /** @return whether the fsimage has finished loading. */
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }

  // exposed for unit tests; also used internally by setImageLoaded()/clear()
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
568
569  /**
570   * Block until the object is imageLoaded to be used.
571   */
572  void waitForLoadingFSImage() {
573    if (!imageLoaded) {
574      writeLock();
575      try {
576        while (!imageLoaded) {
577          try {
578            cond.await(5000, TimeUnit.MILLISECONDS);
579          } catch (InterruptedException ignored) {
580          }
581        }
582      } finally {
583        writeUnlock();
584      }
585    }
586  }
587
588  /**
589   * Clear all loaded data
590   */
591  void clear() {
592    dir.reset();
593    dtSecretManager.reset();
594    blockIdManager.clear();
595    leaseManager.removeAllLeases();
596    snapshotManager.clearSnapshottableDirs();
597    cacheManager.clear();
598    setImageLoaded(false);
599    blockManager.clear();
600  }
601
  /** Exposed for unit tests. @return the lease manager for this namesystem. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
  
  /** @return whether HA is enabled for this namenode. */
  boolean isHaEnabled() {
    return haEnabled;
  }
610  
611  /**
612   * Check the supplied configuration for correctness.
613   * @param conf Supplies the configuration to validate.
614   * @throws IOException if the configuration could not be queried.
615   * @throws IllegalArgumentException if the configuration is invalid.
616   */
617  private static void checkConfiguration(Configuration conf)
618      throws IOException {
619
620    final Collection<URI> namespaceDirs =
621        FSNamesystem.getNamespaceDirs(conf);
622    final Collection<URI> editsDirs =
623        FSNamesystem.getNamespaceEditsDirs(conf);
624    final Collection<URI> requiredEditsDirs =
625        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
626    final Collection<URI> sharedEditsDirs =
627        FSNamesystem.getSharedEditsDirs(conf);
628
629    for (URI u : requiredEditsDirs) {
630      if (u.toString().compareTo(
631              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
632        continue;
633      }
634
635      // Each required directory must also be in editsDirs or in
636      // sharedEditsDirs.
637      if (!editsDirs.contains(u) &&
638          !sharedEditsDirs.contains(u)) {
639        throw new IllegalArgumentException(
640            "Required edits directory " + u.toString() + " not present in " +
641            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
642            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
643            editsDirs.toString() + "; " +
644            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
645            requiredEditsDirs.toString() + ". " +
646            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
647            sharedEditsDirs.toString() + ".");
648      }
649    }
650
651    if (namespaceDirs.size() == 1) {
652      LOG.warn("Only one image storage directory ("
653          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
654          + " due to lack of redundant storage directories!");
655    }
656    if (editsDirs.size() == 1) {
657      LOG.warn("Only one namespace edits storage directory ("
658          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
659          + " due to lack of redundant storage directories!");
660    }
661  }
662
663  /**
664   * Instantiates an FSNamesystem loaded from the image and edits
665   * directories specified in the passed Configuration.
666   *
667   * @param conf the Configuration which specifies the storage directories
668   *             from which to load
669   * @return an FSNamesystem which contains the loaded namespace
670   * @throws IOException if loading fails
671   */
672  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
673
674    checkConfiguration(conf);
675    FSImage fsImage = new FSImage(conf,
676        FSNamesystem.getNamespaceDirs(conf),
677        FSNamesystem.getNamespaceEditsDirs(conf));
678    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
679    StartupOption startOpt = NameNode.getStartupOption(conf);
680    if (startOpt == StartupOption.RECOVER) {
681      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
682    }
683
684    long loadStart = monotonicNow();
685    try {
686      namesystem.loadFSImage(startOpt);
687    } catch (IOException ioe) {
688      LOG.warn("Encountered exception loading fsimage", ioe);
689      fsImage.close();
690      throw ioe;
691    }
692    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
693    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
694    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
695    if (nnMetrics != null) {
696      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
697    }
698    return namesystem;
699  }
700  
  /**
   * Convenience constructor; equivalent to
   * {@code FSNamesystem(conf, fsImage, false)}, i.e. the retry cache is
   * set up normally.
   *
   * @param conf configuration
   * @param fsImage the FSImage to associate with
   * @throws IOException on bad configuration
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
704  
705  /**
706   * Create an FSNamesystem associated with the specified image.
707   * 
708   * Note that this does not load any data off of disk -- if you would
709   * like that behavior, use {@link #loadFromDisk(Configuration)}
710   *
711   * @param conf configuration
712   * @param fsImage The FSImage to associate with
713   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
714   *                         step. For Secondary NN this should be set to true.
715   * @throws IOException on bad configuration
716   */
717  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
718      throws IOException {
719    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
720    if (provider == null) {
721      LOG.info("No KeyProvider found.");
722    } else {
723      LOG.info("Found KeyProvider: " + provider.toString());
724    }
725    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
726                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
727      LOG.info("Enabling async auditlog");
728      enableAsyncAuditLog();
729    }
730    fsLock = new FSNamesystemLock(conf, detailedLockHoldTimeMetrics);
731    cond = fsLock.newWriteLockCondition();
732    cpLock = new ReentrantLock();
733
734    this.fsImage = fsImage;
735    try {
736      resourceRecheckInterval = conf.getLong(
737          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
738          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
739
740      this.blockManager = new BlockManager(this, conf);
741      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
742      this.blockIdManager = new BlockIdManager(blockManager);
743
744      this.fsOwner = UserGroupInformation.getCurrentUser();
745      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
746                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
747      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
748                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
749      LOG.info("fsOwner             = " + fsOwner);
750      LOG.info("supergroup          = " + supergroup);
751      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
752
753      // block allocation has to be persisted in HA using a shared edits directory
754      // so that the standby has up-to-date namespace information
755      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
756      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
757      
758      // Sanity check the HA-related config.
759      if (nameserviceId != null) {
760        LOG.info("Determined nameservice ID: " + nameserviceId);
761      }
762      LOG.info("HA Enabled: " + haEnabled);
763      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
764        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
765        throw new IOException("Invalid configuration: a shared edits dir " +
766            "must not be specified if HA is not enabled.");
767      }
768
769      // Get the checksum type from config
770      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
771      DataChecksum.Type checksumType;
772      try {
773         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
774      } catch (IllegalArgumentException iae) {
775         throw new IOException("Invalid checksum type in "
776            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
777      }
778
779      this.serverDefaults = new FsServerDefaults(
780          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
781          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
782          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
783          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
784          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
785          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
786          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
787          checksumType);
788      
789      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
790                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
791
792      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
793          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
794      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
795          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
796      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
797          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
798      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
799      LOG.info("Append Enabled: " + supportAppends);
800
801      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
802      
803      this.standbyShouldCheckpoint = conf.getBoolean(
804          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
805      // # edit autoroll threshold is a multiple of the checkpoint threshold 
806      this.editLogRollerThreshold = (long)
807          (conf.getFloat(
808              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
809              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
810          conf.getLong(
811              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
812              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
813      this.editLogRollerInterval = conf.getInt(
814          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
815          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
816
817      this.lazyPersistFileScrubIntervalSec = conf.getInt(
818          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
819          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
820
821      if (this.lazyPersistFileScrubIntervalSec == 0) {
822        throw new IllegalArgumentException(
823            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
824      }
825
826      // For testing purposes, allow the DT secret manager to be started regardless
827      // of whether security is enabled.
828      alwaysUseDelegationTokensForTests = conf.getBoolean(
829          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
830          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
831      
832      this.dtSecretManager = createDelegationTokenSecretManager(conf);
833      this.dir = new FSDirectory(this, conf);
834      this.snapshotManager = new SnapshotManager(dir);
835      this.cacheManager = new CacheManager(this, conf, blockManager);
836      this.safeMode = new SafeModeInfo(conf);
837      this.topConf = new TopConf(conf);
838      this.auditLoggers = initAuditLoggers(conf);
839      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
840        auditLoggers.get(0) instanceof DefaultAuditLogger;
841      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
842      Class<? extends INodeAttributeProvider> klass = conf.getClass(
843          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
844          null, INodeAttributeProvider.class);
845      if (klass != null) {
846        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
847        LOG.info("Using INode attribute provider: " + klass.getName());
848      }
849    } catch(IOException e) {
850      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
851      close();
852      throw e;
853    } catch (RuntimeException re) {
854      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
855      close();
856      throw re;
857    }
858  }
859
  /** @return the immutable list of audit loggers in use (test hook). */
  @VisibleForTesting
  public List<AuditLogger> getAuditLoggers() {
    return auditLoggers;
  }
864
  /** @return the RPC retry cache, or null if it is disabled (test hook). */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }
869
  /** Acquire the retry cache lock, if a retry cache is configured. */
  void lockRetryCache() {
    if (retryCache != null) {
      retryCache.lock();
    }
  }
875
  /** Release the retry cache lock, if a retry cache is configured. */
  void unlockRetryCache() {
    if (retryCache != null) {
      retryCache.unlock();
    }
  }
881
  /** @return whether or not the retry cache is enabled. */
  boolean hasRetryCache() {
    return retryCache != null;
  }
886  
  /**
   * Record a completed call (and its result payload) in the retry cache,
   * if enabled, so a retried RPC with the same clientId/callId can return
   * the cached result instead of re-executing.
   */
  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
    if (retryCache != null) {
      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
    }
  }
892  
  /**
   * Record a completed call (no payload) in the retry cache, if enabled,
   * so a retried RPC with the same clientId/callId is treated as a success.
   */
  void addCacheEntry(byte[] clientId, int callId) {
    if (retryCache != null) {
      retryCache.addCacheEntry(clientId, callId);
    }
  }
898
  /** @return the key provider crypto extension, or null if none configured (test hook). */
  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }
903
904  @VisibleForTesting
905  static RetryCache initRetryCache(Configuration conf) {
906    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
907                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
908    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
909    if (enable) {
910      float heapPercent = conf.getFloat(
911          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
912          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
913      long entryExpiryMillis = conf.getLong(
914          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
915          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
916      LOG.info("Retry cache will use " + heapPercent
917          + " of total heap and retry cache entry expiry time is "
918          + entryExpiryMillis + " millis");
919      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
920      return new RetryCache("NameNodeRetryCache", heapPercent,
921          entryExpiryNanos);
922    }
923    return null;
924  }
925
926  private List<AuditLogger> initAuditLoggers(Configuration conf) {
927    // Initialize the custom access loggers if configured.
928    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
929    List<AuditLogger> auditLoggers = Lists.newArrayList();
930    if (alClasses != null && !alClasses.isEmpty()) {
931      for (String className : alClasses) {
932        try {
933          AuditLogger logger;
934          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
935            logger = new DefaultAuditLogger();
936          } else {
937            logger = (AuditLogger) Class.forName(className).newInstance();
938          }
939          logger.initialize(conf);
940          auditLoggers.add(logger);
941        } catch (RuntimeException re) {
942          throw re;
943        } catch (Exception e) {
944          throw new RuntimeException(e);
945        }
946      }
947    }
948
949    // Make sure there is at least one logger installed.
950    if (auditLoggers.isEmpty()) {
951      auditLoggers.add(new DefaultAuditLogger());
952    }
953
954    // Add audit logger to calculate top users
955    if (topConf.isEnabled) {
956      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
957      if (DefaultMetricsSystem.instance().getSource(
958          TOPMETRICS_METRICS_SOURCE_NAME) == null) {
959        DefaultMetricsSystem.instance().register(TOPMETRICS_METRICS_SOURCE_NAME,
960            "Top N operations by user", topMetrics);
961      }
962      auditLoggers.add(new TopAuditLogger(topMetrics));
963    }
964
965    return Collections.unmodifiableList(auditLoggers);
966  }
967
  /**
   * Load the namespace from the FSImage and edit logs according to the
   * startup option, saving a fresh checkpoint if the loaded image was
   * stale, and opening the edit log for write unless coming up in
   * standby state. On failure the FSImage is closed before rethrowing.
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      // A rolling-upgrade rollback/downgrade discards any in-progress
      // rolling upgrade state.
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Release storage directories and locks on failure.
        fsImage.close();
      }
      writeUnlock("loadFSImage");
    }
    imageLoadComplete();
  }
1018
1019  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1020      StartupOption startOpt) throws IOException {
1021    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1022        .matches(startOpt) && layoutVersion > HdfsConstants
1023        .NAMENODE_LAYOUT_VERSION;
1024    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1025        .matches(startOpt);
1026    if (rollingRollback || rollingStarted) {
1027      fsImage.updateStorageVersion();
1028    }
1029  }
1030
  /**
   * Start the delegation token secret manager's background threads,
   * if a secret manager is configured.
   */
  private void startSecretManager() {
    if (dtSecretManager != null) {
      try {
        dtSecretManager.startThreads();
      } catch (IOException e) {
        // Inability to start secret manager
        // can't be recovered from.
        throw new RuntimeException(e);
      }
    }
  }
1042  
1043  private void startSecretManagerIfNecessary() {
1044    boolean shouldRun = shouldUseDelegationTokens() &&
1045      !isInSafeMode() && getEditLog().isOpenForWrite();
1046    boolean running = dtSecretManager.isRunning();
1047    if (shouldRun && !running) {
1048      startSecretManager();
1049    }
1050  }
1051
  /** Stop the delegation token secret manager's threads, if one is configured. */
  private void stopSecretManager() {
    if (dtSecretManager != null) {
      dtSecretManager.stopThreads();
    }
  }
1057  
1058  /** 
1059   * Start services common to both active and standby states
1060   */
1061  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1062    this.registerMBean(); // register the MBean for the FSNamesystemState
1063    writeLock();
1064    this.haContext = haContext;
1065    try {
1066      nnResourceChecker = new NameNodeResourceChecker(conf);
1067      checkAvailableResources();
1068      assert safeMode != null && !isPopulatingReplQueues();
1069      StartupProgress prog = NameNode.getStartupProgress();
1070      prog.beginPhase(Phase.SAFEMODE);
1071      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1072        getCompleteBlocksTotal());
1073      setBlockTotal();
1074      blockManager.activate(conf);
1075    } finally {
1076      writeUnlock("startCommonServices");
1077    }
1078    
1079    registerMXBean();
1080    DefaultMetricsSystem.instance().register(this);
1081    if (inodeAttributeProvider != null) {
1082      inodeAttributeProvider.start();
1083      dir.setINodeAttributeProvider(inodeAttributeProvider);
1084    }
1085    snapshotManager.registerMXBean();
1086  }
1087  
1088  /** 
1089   * Stop services common to both active and standby states
1090   */
1091  void stopCommonServices() {
1092    writeLock();
1093    if (inodeAttributeProvider != null) {
1094      dir.setINodeAttributeProvider(null);
1095      inodeAttributeProvider.stop();
1096    }
1097    try {
1098      if (blockManager != null) blockManager.close();
1099    } finally {
1100      writeUnlock("stopCommonServices");
1101    }
1102    RetryCache.clear(retryCache);
1103  }
1104  
1105  /**
1106   * Start services required in active state
1107   * @throws IOException
1108   */
1109  void startActiveServices() throws IOException {
1110    startingActiveService = true;
1111    LOG.info("Starting services required for active state");
1112    writeLock();
1113    try {
1114      FSEditLog editLog = getFSImage().getEditLog();
1115      
1116      if (!editLog.isOpenForWrite()) {
1117        // During startup, we're already open for write during initialization.
1118        editLog.initJournalsForWrite();
1119        // May need to recover
1120        editLog.recoverUnclosedStreams();
1121        
1122        LOG.info("Catching up to latest edits from old active before " +
1123            "taking over writer role in edits logs");
1124        editLogTailer.catchupDuringFailover();
1125        
1126        blockManager.setPostponeBlocksFromFuture(false);
1127        blockManager.getDatanodeManager().markAllDatanodesStale();
1128        blockManager.clearQueues();
1129        blockManager.processAllPendingDNMessages();
1130
1131        // Only need to re-process the queue, If not in SafeMode.
1132        if (!isInSafeMode()) {
1133          LOG.info("Reprocessing replication and invalidation queues");
1134          initializeReplQueues();
1135        }
1136
1137        if (LOG.isDebugEnabled()) {
1138          LOG.debug("NameNode metadata after re-processing " +
1139              "replication and invalidation queues during failover:\n" +
1140              metaSaveAsString());
1141        }
1142        
1143        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1144        LOG.info("Will take over writing edit logs at txnid " + 
1145            nextTxId);
1146        editLog.setNextTxId(nextTxId);
1147
1148        getFSImage().editLog.openForWrite();
1149      }
1150
1151      // Enable quota checks.
1152      dir.enableQuotaChecks();
1153      if (haEnabled) {
1154        // Renew all of the leases before becoming active.
1155        // This is because, while we were in standby mode,
1156        // the leases weren't getting renewed on this NN.
1157        // Give them all a fresh start here.
1158        leaseManager.renewAllLeases();
1159      }
1160      leaseManager.startMonitor();
1161      startSecretManagerIfNecessary();
1162
1163      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1164      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1165      nnrmthread.start();
1166
1167      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1168          editLogRollerThreshold, editLogRollerInterval));
1169      nnEditLogRoller.start();
1170
1171      if (lazyPersistFileScrubIntervalSec > 0) {
1172        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1173            lazyPersistFileScrubIntervalSec));
1174        lazyPersistFileScrubber.start();
1175      }
1176
1177      cacheManager.startMonitorThread();
1178      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1179    } finally {
1180      startingActiveService = false;
1181      checkSafeMode();
1182      writeUnlock("startActiveServices");
1183    }
1184  }
1185
1186  /**
1187   * Initialize replication queues.
1188   */
1189  private void initializeReplQueues() {
1190    LOG.info("initializing replication queues");
1191    blockManager.processMisReplicatedBlocks();
1192    initializedReplQueues = true;
1193  }
1194
  /** @return true if the HA context exists and reports the ACTIVE service state. */
  private boolean inActiveState() {
    return haContext != null &&
        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
  }
1199
1200  /**
1201   * @return Whether the namenode is transitioning to active state and is in the
1202   *         middle of the {@link #startActiveServices()}
1203   */
1204  public boolean inTransitionToActive() {
1205    return haEnabled && inActiveState() && startingActiveService;
1206  }
1207
  /**
   * @return true if delegation tokens should be in use: either security is
   *         enabled, or the test-only override flag is set.
   */
  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
      alwaysUseDelegationTokensForTests;
  }
1212
1213  /** 
1214   * Stop services required in active state
1215   */
1216  void stopActiveServices() {
1217    LOG.info("Stopping services started for active state");
1218    writeLock();
1219    try {
1220      stopSecretManager();
1221      leaseManager.stopMonitor();
1222      if (nnrmthread != null) {
1223        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1224        nnrmthread.interrupt();
1225      }
1226      if (nnEditLogRoller != null) {
1227        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1228        nnEditLogRoller.interrupt();
1229      }
1230      if (lazyPersistFileScrubber != null) {
1231        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1232        lazyPersistFileScrubber.interrupt();
1233      }
1234      if (dir != null && getFSImage() != null) {
1235        if (getFSImage().editLog != null) {
1236          getFSImage().editLog.close();
1237        }
1238        // Update the fsimage with the last txid that we wrote
1239        // so that the tailer starts from the right spot.
1240        getFSImage().updateLastAppliedTxIdFromWritten();
1241      }
1242      if (cacheManager != null) {
1243        cacheManager.stopMonitorThread();
1244        cacheManager.clearDirectiveStats();
1245      }
1246      blockManager.getDatanodeManager().clearPendingCachingCommands();
1247      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1248      // Don't want to keep replication queues when not in Active.
1249      blockManager.clearQueues();
1250      initializedReplQueues = false;
1251    } finally {
1252      writeUnlock("stopActiveServices");
1253    }
1254  }
1255  
1256  /**
1257   * Start services required in standby state 
1258   * 
1259   * @throws IOException
1260   */
1261  void startStandbyServices(final Configuration conf) throws IOException {
1262    LOG.info("Starting services required for standby state");
1263    if (!getFSImage().editLog.isOpenForRead()) {
1264      // During startup, we're already open for read.
1265      getFSImage().editLog.initSharedJournalsForRead();
1266    }
1267    
1268    blockManager.setPostponeBlocksFromFuture(true);
1269
1270    // Disable quota checks while in standby.
1271    dir.disableQuotaChecks();
1272    editLogTailer = new EditLogTailer(this, conf);
1273    editLogTailer.start();
1274    if (standbyShouldCheckpoint) {
1275      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1276      standbyCheckpointer.start();
1277    }
1278  }
1279
1280  /**
1281   * Called when the NN is in Standby state and the editlog tailer tails the
1282   * OP_ROLLING_UPGRADE_START.
1283   */
1284  void triggerRollbackCheckpoint() {
1285    setNeedRollbackFsImage(true);
1286    if (standbyCheckpointer != null) {
1287      standbyCheckpointer.triggerRollbackCheckpoint();
1288    }
1289  }
1290
1291  /**
1292   * Called while the NN is in Standby state, but just about to be
1293   * asked to enter Active state. This cancels any checkpoints
1294   * currently being taken.
1295   */
1296  void prepareToStopStandbyServices() throws ServiceFailedException {
1297    if (standbyCheckpointer != null) {
1298      standbyCheckpointer.cancelAndPreventCheckpoints(
1299          "About to leave standby state");
1300    }
1301  }
1302
  /**
   * Stop services required in standby state: the standby checkpointer,
   * the edit log tailer, and the edit log itself.
   */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
      getFSImage().editLog.close();
    }
  }
1316  
  /**
   * Verify that the current HA state permits the given operation category
   * (e.g. write operations are rejected on a standby).
   *
   * @throws StandbyException if the HA state disallows the operation
   */
  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }
1324  
1325  /**
1326   * @throws RetriableException
1327   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1328   *           NameNode is in active state
1329   * @throws SafeModeException
1330   *           Otherwise if NameNode is in SafeMode.
1331   */
1332  void checkNameNodeSafeMode(String errorMsg)
1333      throws RetriableException, SafeModeException {
1334    if (isInSafeMode()) {
1335      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1336      if (haEnabled && haContext != null
1337          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1338          && shouldRetrySafeMode(this.safeMode)) {
1339        throw new RetriableException(se);
1340      } else {
1341        throw se;
1342      }
1343    }
1344  }
1345
  /** @return true if permission checking is enabled for this namesystem. */
  boolean isPermissionEnabled() {
    return isPermissionEnabled;
  }
1349
1350  /**
1351   * We already know that the safemode is on. We will throw a RetriableException
1352   * if the safemode is not manual or caused by low resource.
1353   */
1354  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1355    if (safeMode == null) {
1356      return false;
1357    } else {
1358      return !safeMode.isManual() && !safeMode.areResourcesLow();
1359    }
1360  }
1361  
  /**
   * @return the directories configured to hold the file system image
   *         (the {@code dfs.namenode.name.dir} property).
   */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1365
1366  /**
1367   * Get all edits dirs which are required. If any shared edits dirs are
1368   * configured, these are also included in the set of required dirs.
1369   * 
1370   * @param conf the HDFS configuration.
1371   * @return all required dirs.
1372   */
1373  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1374    Set<URI> ret = new HashSet<URI>();
1375    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1376    ret.addAll(getSharedEditsDirs(conf));
1377    return ret;
1378  }
1379
1380  private static Collection<URI> getStorageDirs(Configuration conf,
1381                                                String propertyName) {
1382    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1383    StartupOption startOpt = NameNode.getStartupOption(conf);
1384    if(startOpt == StartupOption.IMPORT) {
1385      // In case of IMPORT this will get rid of default directories 
1386      // but will retain directories specified in hdfs-site.xml
1387      // When importing image from a checkpoint, the name-node can
1388      // start with empty set of storage directories.
1389      Configuration cE = new HdfsConfiguration(false);
1390      cE.addResource("core-default.xml");
1391      cE.addResource("core-site.xml");
1392      cE.addResource("hdfs-default.xml");
1393      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1394      dirNames.removeAll(dirNames2);
1395      if(dirNames.isEmpty())
1396        LOG.warn("!!! WARNING !!!" +
1397          "\n\tThe NameNode currently runs without persistent storage." +
1398          "\n\tAny changes to the file system meta-data may be lost." +
1399          "\n\tRecommended actions:" +
1400          "\n\t\t- shutdown and restart NameNode with configured \"" 
1401          + propertyName + "\" in hdfs-site.xml;" +
1402          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1403          "of the file system meta-data.");
1404    } else if (dirNames.isEmpty()) {
1405      dirNames = Collections.singletonList(
1406          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1407    }
1408    return Util.stringCollectionAsURIs(dirNames);
1409  }
1410
1411  /**
1412   * Return an ordered list of edits directories to write to.
1413   * The list is ordered such that all shared edits directories
1414   * are ordered before non-shared directories, and any duplicates
1415   * are removed. The order they are specified in the configuration
1416   * is retained.
1417   * @return Collection of shared edits directories.
1418   * @throws IOException if multiple shared edits directories are configured
1419   */
1420  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1421      throws IOException {
1422    return getNamespaceEditsDirs(conf, true);
1423  }
1424  
1425  public static List<URI> getNamespaceEditsDirs(Configuration conf,
1426      boolean includeShared)
1427      throws IOException {
1428    // Use a LinkedHashSet so that order is maintained while we de-dup
1429    // the entries.
1430    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
1431    
1432    if (includeShared) {
1433      List<URI> sharedDirs = getSharedEditsDirs(conf);
1434  
1435      // Fail until multiple shared edits directories are supported (HDFS-2782)
1436      if (sharedDirs.size() > 1) {
1437        throw new IOException(
1438            "Multiple shared edits directories are not yet supported");
1439      }
1440  
1441      // First add the shared edits dirs. It's critical that the shared dirs
1442      // are added first, since JournalSet syncs them in the order they are listed,
1443      // and we need to make sure all edits are in place in the shared storage
1444      // before they are replicated locally. See HDFS-2874.
1445      for (URI dir : sharedDirs) {
1446        if (!editsDirs.add(dir)) {
1447          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1448              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
1449        }
1450      }
1451    }    
1452    // Now add the non-shared dirs.
1453    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
1454      if (!editsDirs.add(dir)) {
1455        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1456            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
1457            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
1458      }
1459    }
1460
1461    if (editsDirs.isEmpty()) {
1462      // If this is the case, no edit dirs have been explicitly configured.
1463      // Image dirs are to be used for edits too.
1464      return Lists.newArrayList(getNamespaceDirs(conf));
1465    } else {
1466      return Lists.newArrayList(editsDirs);
1467    }
1468  }
1469  
1470  /**
1471   * Returns edit directories that are shared between primary and secondary.
1472   * @param conf configuration
1473   * @return collection of edit directories from {@code conf}
1474   */
1475  public static List<URI> getSharedEditsDirs(Configuration conf) {
1476    // don't use getStorageDirs here, because we want an empty default
1477    // rather than the dir in /tmp
1478    Collection<String> dirNames = conf.getTrimmedStringCollection(
1479        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1480    return Util.stringCollectionAsURIs(dirNames);
1481  }
1482
  /** Acquire the namesystem read lock. */
  @Override
  public void readLock() {
    this.fsLock.readLock();
  }
  /** Release the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readUnlock();
  }
  /** Release the namesystem read lock, tagging the release with the
   * operation name for lock-metrics reporting. */
  public void readUnlock(String opName) {
    this.fsLock.readUnlock(opName);
  }
  /** Acquire the namesystem write lock. */
  @Override
  public void writeLock() {
    this.fsLock.writeLock();
  }
  /** Acquire the namesystem write lock, aborting if interrupted. */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLockInterruptibly();
  }
  /** Release the namesystem write lock. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeUnlock();
  }
  /** Release the namesystem write lock, tagging the release with the
   * operation name for lock-metrics reporting. */
  public void writeUnlock(String opName) {
    this.fsLock.writeUnlock(opName);
  }
  /** @return true if the current thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /** @return true if the current thread holds the read lock
   * (the write lock implies read access). */
  @Override
  public boolean hasReadLock() {
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1517
  /** @return the current thread's read-lock reentrancy hold count. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1521
  /** @return the current thread's write-lock reentrancy hold count. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1525
  /** Lock the checkpoint lock. */
  public void cpLock() {
    this.cpLock.lock();
  }
1530
  /** Lock the checkpoint lock interruptibly, aborting if interrupted. */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }
1535
  /** Unlock the checkpoint lock. */
  public void cpUnlock() {
    this.cpLock.unlock();
  }
1540    
1541
  /** @return a snapshot of the namespace metadata (namespace id, cluster
   * id, block pool id, cTime), taken under the namesystem read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock("getNamespaceInfo");
    }
  }
1550
1551  /**
1552   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1553   */
1554  NamespaceInfo unprotectedGetNamespaceInfo() {
1555    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
1556        getClusterId(), getBlockPoolId(),
1557        getFSImage().getStorage().getCTime());
1558  }
1559
1560  /**
1561   * Close down this file system manager.
1562   * Causes heartbeat and lease daemons to stop; waits briefly for
1563   * them to finish, but a short timeout returns control back to caller.
1564   */
1565  void close() {
1566    fsRunning = false;
1567    try {
1568      stopCommonServices();
1569      if (smmthread != null) smmthread.interrupt();
1570    } finally {
1571      // using finally to ensure we also wait for lease daemon
1572      try {
1573        stopActiveServices();
1574        stopStandbyServices();
1575      } catch (IOException ie) {
1576      } finally {
1577        IOUtils.cleanup(LOG, dir);
1578        IOUtils.cleanup(LOG, fsImage);
1579      }
1580    }
1581  }
1582
  /** @return true until {@link #close()} flips {@code fsRunning} off. */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1587  
1588  @Override
1589  public boolean isInStandbyState() {
1590    if (haContext == null || haContext.getState() == null) {
1591      // We're still starting up. In this case, if HA is
1592      // on for the cluster, we always start in standby. Otherwise
1593      // start in active.
1594      return haEnabled;
1595    }
1596
1597    return HAServiceState.STANDBY == haContext.getState().getServiceState();
1598  }
1599
1600  /**
1601   * Dump all metadata into specified file
1602   */
1603  void metaSave(String filename) throws IOException {
1604    checkSuperuserPrivilege();
1605    checkOperation(OperationCategory.UNCHECKED);
1606    writeLock();
1607    try {
1608      checkOperation(OperationCategory.UNCHECKED);
1609      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1610      PrintWriter out = new PrintWriter(new BufferedWriter(
1611          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1612      metaSave(out);
1613      out.flush();
1614      out.close();
1615    } finally {
1616      writeUnlock("metaSave");
1617    }
1618  }
1619
1620  private void metaSave(PrintWriter out) {
1621    assert hasWriteLock();
1622    long totalInodes = this.dir.totalInodes();
1623    long totalBlocks = this.getBlocksTotal();
1624    out.println(totalInodes + " files and directories, " + totalBlocks
1625        + " blocks = " + (totalInodes + totalBlocks) + " total");
1626
1627    blockManager.metaSave(out);
1628  }
1629
1630  private String metaSaveAsString() {
1631    StringWriter sw = new StringWriter();
1632    PrintWriter pw = new PrintWriter(sw);
1633    metaSave(pw);
1634    pw.flush();
1635    return sw.toString();
1636  }
1637
  /**
   * @return the cached server defaults for clients.
   * @throws StandbyException if READ operations are not allowed in the
   *         current HA state
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1642
  /** @return the configured access-time precision (milliseconds,
   * per the dfs.namenode.accesstime.precision setting). */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1646
  /** @return true if access-time recording is enabled (precision > 0). */
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1650
1651  /////////////////////////////////////////////////////////
1652  //
1653  // These methods are called by HadoopFS clients
1654  //
1655  /////////////////////////////////////////////////////////
1656  /**
1657   * Set permissions for an existing file.
1658   * @throws IOException
1659   */
1660  void setPermission(String src, FsPermission permission) throws IOException {
1661    final String operationName = "setPermission";
1662    HdfsFileStatus auditStat;
1663    checkOperation(OperationCategory.WRITE);
1664    writeLock();
1665    try {
1666      checkOperation(OperationCategory.WRITE);
1667      checkNameNodeSafeMode("Cannot set permission for " + src);
1668      auditStat = FSDirAttrOp.setPermission(dir, src, permission);
1669    } catch (AccessControlException e) {
1670      logAuditEvent(false, operationName, src);
1671      throw e;
1672    } finally {
1673      writeUnlock(operationName);
1674    }
1675    getEditLog().logSync();
1676    logAuditEvent(true, operationName, src, null, auditStat);
1677  }
1678
1679  /**
1680   * Set owner for an existing file.
1681   * @throws IOException
1682   */
1683  void setOwner(String src, String username, String group)
1684      throws IOException {
1685    final String operationName = "setOwner";
1686    HdfsFileStatus auditStat;
1687    checkOperation(OperationCategory.WRITE);
1688    writeLock();
1689    try {
1690      checkOperation(OperationCategory.WRITE);
1691      checkNameNodeSafeMode("Cannot set owner for " + src);
1692      auditStat = FSDirAttrOp.setOwner(dir, src, username, group);
1693    } catch (AccessControlException e) {
1694      logAuditEvent(false, operationName, src);
1695      throw e;
1696    } finally {
1697      writeUnlock(operationName);
1698    }
1699    getEditLog().logSync();
1700    logAuditEvent(true, operationName, src, null, auditStat);
1701  }
1702
  /**
   * Result of a block-locations lookup: the located blocks plus a flag
   * telling the caller whether the file's access time still needs to be
   * updated (under the write lock, outside the read-locked lookup).
   */
  static class GetBlockLocationsResult {
    // true when the caller should bump the file's access time
    final boolean updateAccessTime;
    final LocatedBlocks blocks;
    boolean updateAccessTime() {
      return updateAccessTime;
    }
    private GetBlockLocationsResult(
        boolean updateAccessTime, LocatedBlocks blocks) {
      this.updateAccessTime = updateAccessTime;
      this.blocks = blocks;
    }
  }
1715
1716  /**
1717   * Get block locations within the specified range.
1718   * @see ClientProtocol#getBlockLocations(String, long, long)
1719   */
1720  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
1721      long offset, long length) throws IOException {
1722    final String operationName = "open";
1723    checkOperation(OperationCategory.READ);
1724    GetBlockLocationsResult res = null;
1725    FSPermissionChecker pc = getPermissionChecker();
1726    readLock();
1727    try {
1728      checkOperation(OperationCategory.READ);
1729      res = getBlockLocations(pc, srcArg, offset, length, true, true);
1730    } catch (AccessControlException e) {
1731      logAuditEvent(false, operationName, srcArg);
1732      throw e;
1733    } finally {
1734      readUnlock(operationName);
1735    }
1736
1737    logAuditEvent(true, operationName, srcArg);
1738
1739    if (res.updateAccessTime()) {
1740      String src = srcArg;
1741      writeLock();
1742      final long now = now();
1743      try {
1744        checkOperation(OperationCategory.WRITE);
1745        /**
1746         * Resolve the path again and update the atime only when the file
1747         * exists.
1748         *
1749         * XXX: Races can still occur even after resolving the path again.
1750         * For example:
1751         *
1752         * <ul>
1753         *   <li>Get the block location for "/a/b"</li>
1754         *   <li>Rename "/a/b" to "/c/b"</li>
1755         *   <li>The second resolution still points to "/a/b", which is
1756         *   wrong.</li>
1757         * </ul>
1758         *
1759         * The behavior is incorrect but consistent with the one before
1760         * HDFS-7463. A better fix is to change the edit log of SetTime to
1761         * use inode id instead of a path.
1762         */
1763        final INodesInPath iip = dir.resolvePath(pc, src);
1764        src = iip.getPath();
1765        INode inode = iip.getLastINode();
1766        boolean updateAccessTime = inode != null &&
1767            now > inode.getAccessTime() + getAccessTimePrecision();
1768        if (!isInSafeMode() && updateAccessTime) {
1769          boolean changed = FSDirAttrOp.setTimes(dir,
1770              inode, -1, now, false, iip.getLatestSnapshotId());
1771          if (changed) {
1772            getEditLog().logTimes(src, -1, now);
1773          }
1774        }
1775      } catch (Throwable e) {
1776        LOG.warn("Failed to update the access time of " + src, e);
1777      } finally {
1778        writeUnlock(operationName);
1779      }
1780    }
1781
1782    LocatedBlocks blocks = res.blocks;
1783    if (blocks != null) {
1784      blockManager.getDatanodeManager().sortLocatedBlocks(
1785          clientMachine, blocks.getLocatedBlocks());
1786
1787      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
1788      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1789      if (lastBlock != null) {
1790        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
1791        blockManager.getDatanodeManager().sortLocatedBlocks(
1792            clientMachine, lastBlockList);
1793      }
1794    }
1795    return blocks;
1796  }
1797
1798  /**
1799   * Get block locations within the specified range.
1800   * @see ClientProtocol#getBlockLocations(String, long, long)
1801   * @throws IOException
1802   */
1803  GetBlockLocationsResult getBlockLocations(
1804      FSPermissionChecker pc, String src, long offset, long length,
1805      boolean needBlockToken, boolean checkSafeMode) throws IOException {
1806    if (offset < 0) {
1807      throw new HadoopIllegalArgumentException(
1808          "Negative offset is not supported. File: " + src);
1809    }
1810    if (length < 0) {
1811      throw new HadoopIllegalArgumentException(
1812          "Negative length is not supported. File: " + src);
1813    }
1814    final GetBlockLocationsResult ret = getBlockLocationsInt(
1815        pc, src, offset, length, needBlockToken);
1816
1817    if (checkSafeMode && isInSafeMode()) {
1818      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
1819        // if safemode & no block locations yet then throw safemodeException
1820        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1821          SafeModeException se = new SafeModeException(
1822              "Zero blocklocations for " + src, safeMode);
1823          if (haEnabled && haContext != null &&
1824              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1825            throw new RetriableException(se);
1826          } else {
1827            throw se;
1828          }
1829        }
1830      }
1831    }
1832    return ret;
1833  }
1834
  /**
   * Core of the block-locations lookup: resolves the path, checks read
   * access, computes the visible file size, builds the LocatedBlocks
   * (with encryption info and cached-location data), and decides whether
   * the caller should update the file's access time afterwards.
   * Caller must hold the read lock.
   */
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    final INodesInPath iip = dir.resolvePath(pc, src);
    src = iip.getPath();
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    // For a snapshot path, size is computed as of the snapshot; otherwise
    // the last under-construction block is excluded from the visible size.
    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // /.reserved/raw paths expose raw bytes, so no encryption info is
    // attached for them.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // atime is only updated when enabled, not in safe mode, not a snapshot
    // path, and the current atime is older than the configured precision.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }
1878
1879  /**
1880   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1881   * To avoid rollbacks we will verify validity of ALL of the args
1882   * before we start actual move.
1883   * 
1884   * This does not support ".inodes" relative path
1885   * @param target target to concat into
1886   * @param srcs file that will be concatenated
1887   * @throws IOException on error
1888   */
1889  void concat(String target, String [] srcs, boolean logRetryCache)
1890      throws IOException {
1891    waitForLoadingFSImage();
1892    final String operationName = "concat";
1893    HdfsFileStatus stat = null;
1894    boolean success = false;
1895    writeLock();
1896    try {
1897      checkOperation(OperationCategory.WRITE);
1898      checkNameNodeSafeMode("Cannot concat " + target);
1899      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
1900      success = true;
1901    } finally {
1902      writeUnlock(operationName);
1903      if (success) {
1904        getEditLog().logSync();
1905      }
1906      logAuditEvent(success, operationName, Arrays.toString(srcs),
1907          target, stat);
1908    }
1909  }
1910
1911  /**
1912   * stores the modification and access time for this inode. 
1913   * The access time is precise up to an hour. The transaction, if needed, is
1914   * written to the edits log but is not flushed.
1915   */
1916  void setTimes(String src, long mtime, long atime) throws IOException {
1917    final String operationName = "setTimes";
1918    HdfsFileStatus auditStat;
1919    checkOperation(OperationCategory.WRITE);
1920    writeLock();
1921    try {
1922      checkOperation(OperationCategory.WRITE);
1923      checkNameNodeSafeMode("Cannot set times " + src);
1924      auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime);
1925    } catch (AccessControlException e) {
1926      logAuditEvent(false, operationName, src);
1927      throw e;
1928    } finally {
1929      writeUnlock(operationName);
1930    }
1931    getEditLog().logSync();
1932    logAuditEvent(true, operationName, src, null, auditStat);
1933  }
1934
1935  /**
1936   * Create a symbolic link.
1937   */
1938  @SuppressWarnings("deprecation")
1939  void createSymlink(String target, String link,
1940      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache)
1941      throws IOException {
1942    final String operationName = "createSymlink";
1943    if (!FileSystem.areSymlinksEnabled()) {
1944      throw new UnsupportedOperationException("Symlinks not supported");
1945    }
1946    HdfsFileStatus auditStat = null;
1947    checkOperation(OperationCategory.WRITE);
1948    writeLock();
1949    try {
1950      checkOperation(OperationCategory.WRITE);
1951      checkNameNodeSafeMode("Cannot create symlink " + link);
1952      auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms,
1953                                                  createParent, logRetryCache);
1954    } catch (AccessControlException e) {
1955      logAuditEvent(false, operationName, link, target, null);
1956      throw e;
1957    } finally {
1958      writeUnlock(operationName);
1959    }
1960    getEditLog().logSync();
1961    logAuditEvent(true, operationName, link, target, auditStat);
1962  }
1963
1964  /**
1965   * Set replication for an existing file.
1966   * 
1967   * The NameNode sets new replication and schedules either replication of 
1968   * under-replicated data blocks or removal of the excessive block copies 
1969   * if the blocks are over-replicated.
1970   * 
1971   * @see ClientProtocol#setReplication(String, short)
1972   * @param src file name
1973   * @param replication new replication
1974   * @return true if successful; 
1975   *         false if file does not exist or is a directory
1976   */
1977  boolean setReplication(final String src, final short replication)
1978      throws IOException {
1979    final String operationName = "setReplication";
1980    boolean success = false;
1981    waitForLoadingFSImage();
1982    checkOperation(OperationCategory.WRITE);
1983    writeLock();
1984    try {
1985      checkOperation(OperationCategory.WRITE);
1986      checkNameNodeSafeMode("Cannot set replication for " + src);
1987      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
1988    } catch (AccessControlException e) {
1989      logAuditEvent(false, operationName, src);
1990      throw e;
1991    } finally {
1992      writeUnlock(operationName);
1993    }
1994    if (success) {
1995      getEditLog().logSync();
1996      logAuditEvent(true, operationName, src);
1997    }
1998    return success;
1999  }
2000
2001  /**
2002   * Truncate file to a lower length.
2003   * Truncate cannot be reverted / recovered from as it causes data loss.
2004   * Truncation at block boundary is atomic, otherwise it requires
2005   * block recovery to truncate the last block of the file.
2006   *
2007   * @return true if client does not need to wait for block recovery,
2008   * false if client needs to wait for block recovery.
2009   */
2010  boolean truncate(String src, long newLength,
2011                   String clientName, String clientMachine,
2012                   long mtime)
2013      throws IOException, UnresolvedLinkException {
2014    boolean ret;
2015    try {
2016      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
2017    } catch (AccessControlException e) {
2018      logAuditEvent(false, "truncate", src);
2019      throw e;
2020    }
2021    return ret;
2022  }
2023
  /**
   * Locked portion of truncate: validates the new length, performs the
   * truncation under the write lock, syncs the edit log, and releases any
   * blocks made obsolete by the truncation outside the lock.
   * @return true if the truncation completed at a block boundary (no
   *         recovery needed), false if the client must wait for recovery
   */
  boolean truncateInt(String srcArg, long newLength,
                      String clientName, String clientMachine,
                      long mtime)
      throws IOException, UnresolvedLinkException {
    final String operationName = "truncate";
    String src = srcArg;
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.truncate: src={} newLength={}", src, newLength);
    if (newLength < 0) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a negative file size: " + newLength + ".");
    }
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    boolean res;
    writeLock();
    // Collects blocks to be removed from the blocks map after the lock is
    // released.
    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot truncate for " + src);
      INodesInPath iip = dir.resolvePath(pc, src);
      src = iip.getPath();
      res = truncateInternal(src, newLength, clientName,
          clientMachine, mtime, pc, toRemoveBlocks);
      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
    } finally {
      writeUnlock(operationName);
    }
    getEditLog().logSync();
    // Block removal is deferred until after the write lock is dropped.
    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(toRemoveBlocks);
      toRemoveBlocks.clear();
    }
    logAuditEvent(true, operationName, src, null, stat);
    return res;
  }
2061
2062  /**
2063   * Truncate a file to a given size
2064   * Update the count at each ancestor directory with quota
2065   */
2066  boolean truncateInternal(String src, long newLength,
2067                           String clientName, String clientMachine,
2068                           long mtime, FSPermissionChecker pc,
2069                           BlocksMapUpdateInfo toRemoveBlocks)
2070      throws IOException, UnresolvedLinkException {
2071    assert hasWriteLock();
2072    INodesInPath iip = dir.getINodesInPath4Write(src, true);
2073    if (isPermissionEnabled) {
2074      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2075    }
2076    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
2077    final BlockStoragePolicy lpPolicy =
2078        blockManager.getStoragePolicy("LAZY_PERSIST");
2079
2080    if (lpPolicy != null &&
2081        lpPolicy.getId() == file.getStoragePolicyID()) {
2082      throw new UnsupportedOperationException(
2083          "Cannot truncate lazy persist file " + src);
2084    }
2085
2086    // Check if the file is already being truncated with the same length
2087    final BlockInfoContiguous last = file.getLastBlock();
2088    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2089      final Block truncateBlock
2090          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
2091      if (truncateBlock != null) {
2092        final long truncateLength = file.computeFileSize(false, false)
2093            + truncateBlock.getNumBytes();
2094        if (newLength == truncateLength) {
2095          return false;
2096        }
2097      }
2098    }
2099
2100    // Opening an existing file for truncate. May need lease recovery.
2101    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
2102        iip, src, clientName, clientMachine, false);
2103    // Truncate length check.
2104    long oldLength = file.computeFileSize();
2105    if(oldLength == newLength) {
2106      return true;
2107    }
2108    if(oldLength < newLength) {
2109      throw new HadoopIllegalArgumentException(
2110          "Cannot truncate to a larger file size. Current size: " + oldLength +
2111              ", truncate size: " + newLength + ".");
2112    }
2113    // Perform INodeFile truncation.
2114    final QuotaCounts delta = new QuotaCounts.Builder().build();
2115    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
2116        mtime, delta);
2117    Block truncateBlock = null;
2118    if(!onBlockBoundary) {
2119      // Open file for write, but don't log into edits
2120      long lastBlockDelta = file.computeFileSize() - newLength;
2121      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
2122      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
2123          lastBlockDelta, null);
2124    }
2125
2126    // update the quota: use the preferred block size for UC block
2127    dir.writeLock();
2128    try {
2129      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2130    } finally {
2131      dir.writeUnlock();
2132    }
2133
2134    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
2135        truncateBlock);
2136    return onBlockBoundary;
2137  }
2138
2139  /**
2140   * Convert current INode to UnderConstruction.
2141   * Recreate lease.
2142   * Create new block for the truncated copy.
2143   * Schedule truncation of the replicas.
2144   *
2145   * @return the returned block will be written to editLog and passed back into
2146   * this method upon loading.
2147   */
2148  Block prepareFileForTruncate(INodesInPath iip,
2149                               String leaseHolder,
2150                               String clientMachine,
2151                               long lastBlockDelta,
2152                               Block newBlock)
2153      throws IOException {
2154    INodeFile file = iip.getLastINode().asFile();
2155    String src = iip.getPath();
2156    file.recordModification(iip.getLatestSnapshotId());
2157    file.toUnderConstruction(leaseHolder, clientMachine);
2158    assert file.isUnderConstruction() : "inode should be under construction.";
2159    leaseManager.addLease(
2160        file.getFileUnderConstructionFeature().getClientName(), src);
2161    boolean shouldRecoverNow = (newBlock == null);
2162    BlockInfoContiguous oldBlock = file.getLastBlock();
2163    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
2164    if(newBlock == null) {
2165      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
2166          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
2167              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
2168    }
2169
2170    BlockInfoContiguousUnderConstruction truncatedBlockUC;
2171    if(shouldCopyOnTruncate) {
2172      // Add new truncateBlock into blocksMap and
2173      // use oldBlock as a source for copy-on-truncate recovery
2174      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
2175          file.getBlockReplication());
2176      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
2177      truncatedBlockUC.setTruncateBlock(oldBlock);
2178      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
2179      getBlockManager().addBlockCollection(truncatedBlockUC, file);
2180
2181      NameNode.stateChangeLog.debug(
2182          "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" +
2183          " size {}  new block {} old block {}", truncatedBlockUC.getNumBytes(),
2184          newBlock, truncatedBlockUC.getTruncateBlock());
2185    } else {
2186      // Use new generation stamp for in-place truncate recovery
2187      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
2188      oldBlock = file.getLastBlock();
2189      assert !oldBlock.isComplete() : "oldBlock should be under construction";
2190      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
2191      truncatedBlockUC.setTruncateBlock(new Block(oldBlock));
2192      truncatedBlockUC.getTruncateBlock().setNumBytes(
2193          oldBlock.getNumBytes() - lastBlockDelta);
2194      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
2195          newBlock.getGenerationStamp());
2196
2197      NameNode.stateChangeLog.debug(
2198          "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " +
2199          "truncate to new size {}",
2200          truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC);
2201    }
2202    if (shouldRecoverNow) {
2203      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp());
2204    }
2205
2206    return newBlock;
2207  }
2208
2209  /**
2210   * Defines if a replica needs to be copied on truncate or
2211   * can be truncated in place.
2212   */
2213  boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) {
2214    if(!isUpgradeFinalized()) {
2215      return true;
2216    }
2217    if (isRollingUpgrade()) {
2218      return true;
2219    }
2220    return file.isBlockInLatestSnapshot(blk);
2221  }
2222
2223  /**
2224   * Set the storage policy for a file or a directory.
2225   *
2226   * @param src file/directory path
2227   * @param policyName storage policy name
2228   */
2229  void setStoragePolicy(String src, String policyName) throws IOException {
2230    HdfsFileStatus auditStat;
2231    waitForLoadingFSImage();
2232    checkOperation(OperationCategory.WRITE);
2233    final String operationName = "setStoragePolicy";
2234    writeLock();
2235    try {
2236      checkOperation(OperationCategory.WRITE);
2237      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2238      auditStat = FSDirAttrOp.setStoragePolicy(
2239          dir, blockManager, src, policyName);
2240    } catch (AccessControlException e) {
2241      logAuditEvent(false, operationName, src);
2242      throw e;
2243    } finally {
2244      writeUnlock(operationName);
2245    }
2246    getEditLog().logSync();
2247    logAuditEvent(true, operationName, src, null, auditStat);
2248  }
2249
2250  /**
2251   * @return All the existing block storage policies
2252   */
2253  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2254    checkOperation(OperationCategory.READ);
2255    waitForLoadingFSImage();
2256    readLock();
2257    try {
2258      checkOperation(OperationCategory.READ);
2259      return FSDirAttrOp.getStoragePolicies(blockManager);
2260    } finally {
2261      readUnlock("getStoragePolicies");
2262    }
2263  }
2264
  /**
   * @return the preferred block size of the file at {@code src},
   * read under the namesystem read lock.
   */
  long getPreferredBlockSize(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      return FSDirAttrOp.getPreferredBlockSize(dir, src);
    } finally {
      readUnlock("getPreferredBlockSize");
    }
  }
2275
2276  /**
2277   * If the file is within an encryption zone, select the appropriate 
2278   * CryptoProtocolVersion from the list provided by the client. Since the
2279   * client may be newer, we need to handle unknown versions.
2280   *
2281   * @param zone EncryptionZone of the file
2282   * @param supportedVersions List of supported protocol versions
2283   * @return chosen protocol version
2284   * @throws IOException
2285   */
2286  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2287      CryptoProtocolVersion[] supportedVersions)
2288      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2289        SnapshotAccessControlException {
2290    Preconditions.checkNotNull(zone);
2291    Preconditions.checkNotNull(supportedVersions);
2292    // Right now, we only support a single protocol version,
2293    // so simply look for it in the list of provided options
2294    final CryptoProtocolVersion required = zone.getVersion();
2295
2296    for (CryptoProtocolVersion c : supportedVersions) {
2297      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2298        if (LOG.isDebugEnabled()) {
2299          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2300              "client: " + c.getUnknownValue());
2301        }
2302        continue;
2303      }
2304      if (c.equals(required)) {
2305        return c;
2306      }
2307    }
2308    throw new UnknownCryptoProtocolVersionException(
2309        "No crypto protocol versions provided by the client are supported."
2310            + " Client provided: " + Arrays.toString(supportedVersions)
2311            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2312            .values()));
2313  }
2314
2315  /**
2316   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2317   * encryption zone. Should not be called with any locks held.
2318   *
2319   * @param ezKeyName key name of an encryption zone
2320   * @return New EDEK, or null if ezKeyName is null
2321   * @throws IOException
2322   */
2323  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2324      ezKeyName) throws IOException {
2325    if (ezKeyName == null) {
2326      return null;
2327    }
2328    EncryptedKeyVersion edek = null;
2329    try {
2330      edek = provider.generateEncryptedKey(ezKeyName);
2331    } catch (GeneralSecurityException e) {
2332      throw new IOException(e);
2333    }
2334    Preconditions.checkNotNull(edek);
2335    return edek;
2336  }
2337
2338  /**
2339   * Create a new file entry in the namespace.
2340   * 
2341   * For description of parameters and exceptions thrown see
2342   * {@link ClientProtocol#create}, except it returns valid file status upon
2343   * success
2344   */
2345  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2346      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2347      boolean createParent, short replication, long blockSize, 
2348      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
2349      throws AccessControlException, SafeModeException,
2350      FileAlreadyExistsException, UnresolvedLinkException,
2351      FileNotFoundException, ParentNotDirectoryException, IOException {
2352
2353    HdfsFileStatus status = null;
2354    try {
2355      status = startFileInt(src, permissions, holder, clientMachine, flag,
2356          createParent, replication, blockSize, supportedVersions,
2357          logRetryCache);
2358    } catch (AccessControlException e) {
2359      logAuditEvent(false, "create", src);
2360      throw e;
2361    }
2362    return status;
2363  }
2364
  /**
   * Implementation of {@link #startFile}: validates the request, optionally
   * pre-generates an EDEK for encryption-zone paths while no lock is held,
   * then creates the file entry under the write lock.
   *
   * @param srcArg path of the file to create, as supplied by the client
   * @param permissions permissions to apply to the new file
   * @param holder lease holder (client name)
   * @param clientMachine identifier of the client machine
   * @param flag create flags (CREATE / OVERWRITE / LAZY_PERSIST, ...)
   * @param createParent whether to implicitly create missing parent dirs
   * @param replication replication factor for the new file
   * @param blockSize preferred block size for the new file
   * @param supportedVersions crypto protocol versions the client supports
   * @param logRetryCache whether to record the RPC id in the edit log for
   *                      retry-cache rebuilding
   * @return file status of the created file
   */
  private HdfsFileStatus startFileInt(final String srcArg,
      PermissionStatus permissions, String holder, String clientMachine,
      EnumSet<CreateFlag> flag, boolean createParent, short replication,
      long blockSize, CryptoProtocolVersion[] supportedVersions,
      boolean logRetryCache)
      throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {
    String src = srcArg;
    final String operationName = "create";
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append("DIR* NameSystem.startFile: src=" + src
              + ", holder=" + holder
              + ", clientMachine=" + clientMachine
              + ", createParent=" + createParent
              + ", replication=" + replication
              + ", createFlag=" + flag.toString()
              + ", blockSize=" + blockSize);
      builder.append(", supportedVersions=");
      if (supportedVersions != null) {
        builder.append(Arrays.toString(supportedVersions));
      } else {
        builder.append("null");
      }
      NameNode.stateChangeLog.debug(builder.toString());
    }
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    }
    blockManager.verifyReplication(src, replication, clientMachine);

    boolean skipSync = false;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Reject undersized block sizes before taking any lock.
    if (blockSize < minBlockSize) {
      throw new IOException("Specified block size is less than configured" +
          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
          + "): " + blockSize + " < " + minBlockSize);
    }
    boolean create = flag.contains(CreateFlag.CREATE);
    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);

    waitForLoadingFSImage();

    /**
     * If the file is in an encryption zone, we optimistically create an
     * EDEK for the file by calling out to the configured KeyProvider.
     * Since this typically involves doing an RPC, we take the readLock
     * initially, then drop it to do the RPC.
     * 
     * Since the path can flip-flop between being in an encryption zone and not
     * in the meantime, we need to recheck the preconditions when we retake the
     * lock to do the create. If the preconditions are not met, we throw a
     * special RetryStartFileException to ask the DFSClient to try the create
     * again later.
     */
    CryptoProtocolVersion protocolVersion = null;
    CipherSuite suite = null;
    String ezKeyName = null;
    EncryptedKeyVersion edek = null;

    if (provider != null) {
      // Phase 1 (read lock): probe whether src lies in an encryption zone
      // and capture the zone's crypto settings.
      readLock();
      try {
        INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        // Nothing to do if the path is not within an EZ
        final EncryptionZone zone = dir.getEZForPath(iip);
        if (zone != null) {
          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
          suite = zone.getSuite();
          ezKeyName = zone.getKeyName();

          Preconditions.checkNotNull(protocolVersion);
          Preconditions.checkNotNull(suite);
          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
              "Chose an UNKNOWN CipherSuite!");
          Preconditions.checkNotNull(ezKeyName);
        }
      } finally {
        readUnlock(operationName);
      }

      Preconditions.checkState(
          (suite == null && ezKeyName == null) ||
              (suite != null && ezKeyName != null),
          "Both suite and ezKeyName should both be null or not null");

      // Generate EDEK if necessary while not holding the lock
      edek = generateEncryptedDataEncryptionKey(ezKeyName);
      EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
    }

    // Proceed with the create, using the computed cipher suite and 
    // generated EDEK
    BlocksMapUpdateInfo toRemoveBlocks = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create file" + src);
      dir.writeLock();
      try {
        // Phase 2 (write lock): re-resolve the path, since it may have
        // changed while the lock was dropped for EDEK generation above.
        final INodesInPath iip = dir.resolvePathForWrite(pc, src);
        src = iip.getPath();
        toRemoveBlocks = startFileInternal(
            pc, iip, permissions, holder,
            clientMachine, create, overwrite,
            createParent, replication, blockSize,
            isLazyPersist, suite, protocolVersion, edek,
            logRetryCache);
        stat = FSDirStatAndListingOp.getFileInfo(
            dir, src, false, FSDirectory.isReservedRawName(srcArg));
      } finally {
        dir.writeUnlock();
      }
    } catch (StandbyException se) {
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
        if (toRemoveBlocks != null) {
          // Blocks freed by an OVERWRITE delete are removed outside the
          // namespace lock, after the edit log has been synced.
          removeBlocks(toRemoveBlocks);
          toRemoveBlocks.clear();
        }
      }
    }

    logAuditEvent(true, operationName, srcArg, null, stat);
    return stat;
  }
2501
2502  /**
2503   * Create a new file or overwrite an existing file<br>
2504   * 
2505   * Once the file is create the client then allocates a new block with the next
2506   * call using {@link ClientProtocol#addBlock}.
2507   * <p>
2508   * For description of parameters and exceptions thrown see
2509   * {@link ClientProtocol#create}
2510   */
2511  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2512      INodesInPath iip, PermissionStatus permissions, String holder,
2513      String clientMachine, boolean create, boolean overwrite, 
2514      boolean createParent, short replication, long blockSize, 
2515      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2516      EncryptedKeyVersion edek, boolean logRetryEntry)
2517      throws IOException {
2518    assert hasWriteLock();
2519    // Verify that the destination does not exist as a directory already.
2520    final INode inode = iip.getLastINode();
2521    final String src = iip.getPath();
2522    if (inode != null && inode.isDirectory()) {
2523      throw new FileAlreadyExistsException(src +
2524          " already exists as a directory");
2525    }
2526
2527    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2528    if (isPermissionEnabled) {
2529      if (overwrite && myFile != null) {
2530        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2531      }
2532      /*
2533       * To overwrite existing file, need to check 'w' permission 
2534       * of parent (equals to ancestor in this case)
2535       */
2536      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
2537    }
2538    if (!createParent) {
2539      dir.verifyParentDir(iip, src);
2540    }
2541
2542    FileEncryptionInfo feInfo = null;
2543
2544    final EncryptionZone zone = dir.getEZForPath(iip);
2545    if (zone != null) {
2546      // The path is now within an EZ, but we're missing encryption parameters
2547      if (suite == null || edek == null) {
2548        throw new RetryStartFileException();
2549      }
2550      // Path is within an EZ and we have provided encryption parameters.
2551      // Make sure that the generated EDEK matches the settings of the EZ.
2552      final String ezKeyName = zone.getKeyName();
2553      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2554        throw new RetryStartFileException();
2555      }
2556      feInfo = new FileEncryptionInfo(suite, version,
2557          edek.getEncryptedKeyVersion().getMaterial(),
2558          edek.getEncryptedKeyIv(),
2559          ezKeyName, edek.getEncryptionKeyVersionName());
2560    }
2561
2562    try {
2563      BlocksMapUpdateInfo toRemoveBlocks = null;
2564      if (myFile == null) {
2565        if (!create) {
2566          throw new FileNotFoundException("Can't overwrite non-existent " +
2567              src + " for client " + clientMachine);
2568        }
2569      } else {
2570        if (overwrite) {
2571          toRemoveBlocks = new BlocksMapUpdateInfo();
2572          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2573          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
2574                                          toRemoveINodes, now());
2575          if (ret >= 0) {
2576            iip = INodesInPath.replace(iip, iip.length() - 1, null);
2577            FSDirDeleteOp.incrDeletedFileCount(ret);
2578            removeLeasesAndINodes(src, toRemoveINodes, true);
2579          }
2580        } else {
2581          // If lease soft limit time is expired, recover the lease
2582          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
2583              iip, src, holder, clientMachine, false);
2584          throw new FileAlreadyExistsException(src + " for client " +
2585              clientMachine + " already exists");
2586        }
2587      }
2588
2589      checkFsObjectLimit();
2590      INodeFile newNode = null;
2591
2592      // Always do an implicit mkdirs for parent directory tree.
2593      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
2594          .createAncestorDirectories(dir, iip, permissions);
2595      if (parent != null) {
2596        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
2597            replication, blockSize, holder, clientMachine);
2598        newNode = iip != null ? iip.getLastINode().asFile() : null;
2599      }
2600
2601      if (newNode == null) {
2602        throw new IOException("Unable to add " + src +  " to namespace");
2603      }
2604      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2605          .getClientName(), src);
2606
2607      // Set encryption attributes if necessary
2608      if (feInfo != null) {
2609        dir.setFileEncryptionInfo(src, feInfo);
2610        newNode = dir.getInode(newNode.getId()).asFile();
2611      }
2612
2613      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2614
2615      // record file record in log, record new generation stamp
2616      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2617      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" +
2618          " inode {} holder {}", src, newNode.getId(), holder);
2619      return toRemoveBlocks;
2620    } catch (IOException ie) {
2621      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2622          ie.getMessage());
2623      throw ie;
2624    }
2625  }
2626
2627  private void setNewINodeStoragePolicy(INodeFile inode,
2628                                        INodesInPath iip,
2629                                        boolean isLazyPersist)
2630      throws IOException {
2631
2632    if (isLazyPersist) {
2633      BlockStoragePolicy lpPolicy =
2634          blockManager.getStoragePolicy("LAZY_PERSIST");
2635
2636      // Set LAZY_PERSIST storage policy if the flag was passed to
2637      // CreateFile.
2638      if (lpPolicy == null) {
2639        throw new HadoopIllegalArgumentException(
2640            "The LAZY_PERSIST storage policy has been disabled " +
2641            "by the administrator.");
2642      }
2643      inode.setStoragePolicyID(lpPolicy.getId(),
2644                                 iip.getLatestSnapshotId());
2645    } else {
2646      BlockStoragePolicy effectivePolicy =
2647          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2648
2649      if (effectivePolicy != null &&
2650          effectivePolicy.isCopyOnCreateFile()) {
2651        // Copy effective policy from ancestor directory to current file.
2652        inode.setStoragePolicyID(effectivePolicy.getId(),
2653                                 iip.getLatestSnapshotId());
2654      }
2655    }
2656  }
2657
2658  /**
2659   * Append to an existing file for append.
2660   * <p>
2661   * 
2662   * The method returns the last block of the file if this is a partial block,
2663   * which can still be used for writing more data. The client uses the returned
2664   * block locations to form the data pipeline for this block.<br>
2665   * The method returns null if the last block is full. The client then
2666   * allocates a new block with the next call using
2667   * {@link ClientProtocol#addBlock}.
2668   * <p>
2669   * 
2670   * For description of parameters and exceptions thrown see
2671   * {@link ClientProtocol#append(String, String, EnumSetWritable)}
2672   *
2673   * @return the last block locations if the block is partial or null otherwise
2674   */
2675  private LocatedBlock appendFileInternal(FSPermissionChecker pc,
2676      INodesInPath iip, String holder, String clientMachine, boolean newBlock,
2677      boolean logRetryCache) throws IOException {
2678    assert hasWriteLock();
2679    // Verify that the destination does not exist as a directory already.
2680    final INode inode = iip.getLastINode();
2681    final String src = iip.getPath();
2682    if (inode != null && inode.isDirectory()) {
2683      throw new FileAlreadyExistsException("Cannot append to directory " + src
2684          + "; already exists as a directory.");
2685    }
2686    if (isPermissionEnabled) {
2687      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2688    }
2689
2690    try {
2691      if (inode == null) {
2692        throw new FileNotFoundException("failed to append to non-existent file "
2693          + src + " for client " + clientMachine);
2694      }
2695      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2696      final BlockStoragePolicy lpPolicy =
2697          blockManager.getStoragePolicy("LAZY_PERSIST");
2698      if (lpPolicy != null &&
2699          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2700        throw new UnsupportedOperationException(
2701            "Cannot append to lazy persist file " + src);
2702      }
2703      // Opening an existing file for append - may need to recover lease.
2704      recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE,
2705          iip, src, holder, clientMachine, false);
2706      
2707      final BlockInfoContiguous lastBlock = myFile.getLastBlock();
2708      // Check that the block has at least minimum replication.
2709      if(lastBlock != null && lastBlock.isComplete() &&
2710          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2711        throw new IOException("append: lastBlock=" + lastBlock +
2712            " of src=" + src + " is not sufficiently replicated yet.");
2713      }
2714      return prepareFileForAppend(src, iip, holder, clientMachine, newBlock,
2715          true, logRetryCache);
2716    } catch (IOException ie) {
2717      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2718      throw ie;
2719    }
2720  }
2721  
2722  /**
2723   * Convert current node to under construction.
2724   * Recreate in-memory lease record.
2725   * 
2726   * @param src path to the file
2727   * @param leaseHolder identifier of the lease holder on this file
2728   * @param clientMachine identifier of the client machine
2729   * @param newBlock if the data is appended to a new block
2730   * @param writeToEditLog whether to persist this change to the edit log
2731   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2732   *                      rebuilding
2733   * @return the last block locations if the block is partial or null otherwise
2734   * @throws UnresolvedLinkException
2735   * @throws IOException
2736   */
2737  LocatedBlock prepareFileForAppend(String src, INodesInPath iip,
2738      String leaseHolder, String clientMachine, boolean newBlock,
2739      boolean writeToEditLog, boolean logRetryCache) throws IOException {
2740    final INodeFile file = iip.getLastINode().asFile();
2741    final QuotaCounts delta = verifyQuotaForUCBlock(file, iip);
2742
2743    file.recordModification(iip.getLatestSnapshotId());
2744    file.toUnderConstruction(leaseHolder, clientMachine);
2745
2746    leaseManager.addLease(
2747        file.getFileUnderConstructionFeature().getClientName(), src);
2748
2749    LocatedBlock ret = null;
2750    if (!newBlock) {
2751      ret = blockManager.convertLastBlockToUnderConstruction(file, 0);
2752      if (ret != null && delta != null) {
2753        Preconditions.checkState(delta.getStorageSpace() >= 0,
2754            "appending to a block with size larger than the preferred block size");
2755        dir.writeLock();
2756        try {
2757          dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2758        } finally {
2759          dir.writeUnlock();
2760        }
2761      }
2762    } else {
2763      BlockInfoContiguous lastBlock = file.getLastBlock();
2764      if (lastBlock != null) {
2765        ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock);
2766        ret = new LocatedBlock(blk, new DatanodeInfo[0]);
2767      }
2768    }
2769
2770    if (writeToEditLog) {
2771      getEditLog().logAppendFile(src, file, newBlock, logRetryCache);
2772    }
2773    return ret;
2774  }
2775
2776  /**
2777   * Verify quota when using the preferred block size for UC block. This is
2778   * usually used by append and truncate
2779   * @throws QuotaExceededException when violating the storage quota
2780   * @return expected quota usage update. null means no change or no need to
2781   *         update quota usage later
2782   */
2783  private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2784      throws QuotaExceededException {
2785    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2786      // Do not check quota if editlog is still being processed
2787      return null;
2788    }
2789    if (file.getLastBlock() != null) {
2790      final QuotaCounts delta = computeQuotaDeltaForUCBlock(file);
2791      dir.readLock();
2792      try {
2793        FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null);
2794        return delta;
2795      } finally {
2796        dir.readUnlock();
2797      }
2798    }
2799    return null;
2800  }
2801
2802  /** Compute quota change for converting a complete block to a UC block */
2803  private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) {
2804    final QuotaCounts delta = new QuotaCounts.Builder().build();
2805    final BlockInfoContiguous lastBlock = file.getLastBlock();
2806    if (lastBlock != null) {
2807      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2808      final short repl = file.getBlockReplication();
2809      delta.addStorageSpace(diff * repl);
2810      final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite()
2811          .getPolicy(file.getStoragePolicyID());
2812      List<StorageType> types = policy.chooseStorageTypes(repl);
2813      for (StorageType t : types) {
2814        if (t.supportTypeQuota()) {
2815          delta.addTypeSpace(t, diff);
2816        }
2817      }
2818    }
2819    return delta;
2820  }
2821
2822  /**
2823   * Recover lease;
2824   * Immediately revoke the lease of the current lease holder and start lease
2825   * recovery so that the file can be forced to be closed.
2826   * 
2827   * @param src the path of the file to start lease recovery
2828   * @param holder the lease holder's name
2829   * @param clientMachine the client machine's name
2830   * @return true if the file is already closed or
2831   *         if the lease can be released and the file can be closed.
2832   * @throws IOException
2833   */
2834  boolean recoverLease(String src, String holder, String clientMachine)
2835      throws IOException {
2836    if (!DFSUtil.isValidName(src)) {
2837      throw new IOException("Invalid file name: " + src);
2838    }
2839  
2840    boolean skipSync = false;
2841    FSPermissionChecker pc = getPermissionChecker();
2842    checkOperation(OperationCategory.WRITE);
2843    writeLock();
2844    try {
2845      checkOperation(OperationCategory.WRITE);
2846      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2847      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
2848      src = iip.getPath();
2849      final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
2850      if (!inode.isUnderConstruction()) {
2851        return true;
2852      }
2853      if (isPermissionEnabled) {
2854        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2855      }
2856  
2857      return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE,
2858          iip, src, holder, clientMachine, true);
2859    } catch (StandbyException se) {
2860      skipSync = true;
2861      throw se;
2862    } finally {
2863      writeUnlock("recoverLease");
2864      // There might be transactions logged while trying to recover the lease.
2865      // They need to be sync'ed even when an exception was thrown.
2866      if (!skipSync) {
2867        getEditLog().logSync();
2868      }
2869    }
2870  }
2871
2872  private enum RecoverLeaseOp {
2873    CREATE_FILE,
2874    APPEND_FILE,
2875    TRUNCATE_FILE,
2876    RECOVER_LEASE;
2877    
2878    private String getExceptionMessage(String src, String holder,
2879        String clientMachine, String reason) {
2880      return "Failed to " + this + " " + src + " for " + holder +
2881          " on " + clientMachine + " because " + reason;
2882    }
2883  }
2884
  /**
   * Check whether the lease on {@code src} can be taken over on behalf of
   * {@code holder}, and with {@code force} immediately release it.
   * <p>
   * Must be called with the FSNamesystem write lock held (asserted below).
   *
   * @param op the operation attempting recovery (used in exception messages)
   * @param iip resolved path to the file
   * @param src path to the file
   * @param holder client requesting the lease
   * @param clientMachine machine of the requesting client
   * @param force if true, release the current lease without waiting for the
   *              soft-limit expiration
   * @return true if the file is not under construction, or was successfully
   *         closed by the forced/soft-limit recovery
   * @throws AlreadyBeingCreatedException if the holder already owns the lease,
   *         or the under-construction file has no lease at all
   * @throws RecoveryInProgressException if a recovery is already under way
   */
  boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip,
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    if (file.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          // We found the lease for this file but the original
          // holder is trying to obtain it again.
          throw new AlreadyBeingCreatedException(
              op.getExceptionMessage(src, holder, clientMachine,
                  holder + " is already the current lease holder."));
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        throw new AlreadyBeingCreatedException(
            op.getExceptionMessage(src, holder, clientMachine,
                "the file is under construction but no leases found."));
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        return internalReleaseLease(lease, src, iip, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          if (internalReleaseLease(lease, src, iip, null)) {
            return true;
          } else {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "lease recovery is in progress. Try again later."));
          }
        } else {
          // Soft limit not expired: report why the lease cannot be taken.
          final BlockInfoContiguous lastBlock = file.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "another recovery is in progress by "
                        + clientName + " on " + uc.getClientMachine()));
          } else {
            throw new AlreadyBeingCreatedException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "this file lease is currently owned by "
                        + clientName + " on " + uc.getClientMachine()));
          }
        }
      }
    } else {
      // File already closed: nothing to recover.
      return true;
     }
  }
2962
2963  /**
2964   * Append to an existing file in the namespace.
2965   */
2966  LastBlockWithStatus appendFile(String src, String holder,
2967      String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache)
2968      throws IOException {
2969    try {
2970      return appendFileInt(src, holder, clientMachine,
2971          flag.contains(CreateFlag.NEW_BLOCK), logRetryCache);
2972    } catch (AccessControlException e) {
2973      logAuditEvent(false, "append", src);
2974      throw e;
2975    }
2976  }
2977
  /**
   * Implementation of {@link #appendFile}: reopen an existing file for
   * append under the namesystem write lock.
   *
   * @param srcArg path of the file to append to, as supplied by the client
   * @param holder lease holder (client name)
   * @param clientMachine identifier of the client machine
   * @param newBlock whether appended data should start in a new block
   * @param logRetryCache whether to record the RPC id in the edit log for
   *                      retry-cache rebuilding
   * @return the last partial block (or null if none) plus the file status
   * @throws UnsupportedOperationException if append support is disabled
   */
  private LastBlockWithStatus appendFileInt(final String srcArg, String holder,
      String clientMachine, boolean newBlock, boolean logRetryCache)
      throws IOException {
    String src = srcArg;
    final String operationName = "append";
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}",
        src, holder, clientMachine);
    boolean skipSync = false;
    // Append may be disabled by configuration; reject early.
    if (!supportAppends) {
      throw new UnsupportedOperationException(
          "Append is not enabled on this NameNode. Use the " +
          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
    }

    LocatedBlock lb = null;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot append to file" + src);
      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
      src = iip.getPath();
      lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock,
          logRetryCache);
      stat = FSDirStatAndListingOp.getFileInfo(dir, src, false,
          FSDirectory.isReservedRawName(srcArg));
    } catch (StandbyException se) {
      skipSync = true;
      throw se;
    } finally {
      writeUnlock(operationName);
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
      }
    }
    if (lb != null) {
      NameNode.stateChangeLog.debug(
          "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" +
          " size {}", src, holder, clientMachine, lb.getBlock(),
          lb.getBlock().getNumBytes());
    }
    logAuditEvent(true, operationName, srcArg);
    return new LastBlockWithStatus(lb, stat);
  }
3026
3027  ExtendedBlock getExtendedBlock(Block blk) {
3028    return new ExtendedBlock(blockPoolId, blk);
3029  }
3030  
3031  void setBlockPoolId(String bpid) {
3032    blockPoolId = bpid;
3033    blockManager.setBlockPoolId(blockPoolId);
3034  }
3035
3036  /**
3037   * The client would like to obtain an additional block for the indicated
3038   * filename (which is being written-to).  Return an array that consists
3039   * of the block, plus a set of machines.  The first on this list should
3040   * be where the client writes data.  Subsequent items in the list must
3041   * be provided in the connection to the first datanode.
3042   *
3043   * Make sure the previous blocks have been reported by datanodes and
3044   * are replicated.  Will return an empty 2-elt array if we want the
3045   * client to "try again later".
3046   */
3047  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
3048      ExtendedBlock previous, Set<Node> excludedNodes, 
3049      List<String> favoredNodes) throws IOException {
3050    LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3051    DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId,
3052        clientName, previous, excludedNodes, favoredNodes, onRetryBlock);
3053    if (targets == null) {
3054      assert onRetryBlock[0] != null : "Retry block is null";
3055      // This is a retry. Just return the last block.
3056      return onRetryBlock[0];
3057    }
3058    LocatedBlock newBlock = storeAllocatedBlock(
3059        src, fileId, clientName, previous, targets);
3060    return newBlock;
3061  }
3062
3063  /**
3064   * Part I of getAdditionalBlock().
3065   * Analyze the state of the file under read lock to determine if the client
3066   * can add a new block, detect potential retries, lease mismatches,
3067   * and minimal replication of the penultimate block.
3068   * 
3069   * Generate target DataNode locations for the new block,
3070   * but do not create the new block yet.
3071   */
3072  DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId,
3073      String clientName, ExtendedBlock previous, Set<Node> excludedNodes,
3074      List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException {
3075    final long blockSize;
3076    final int replication;
3077    final byte storagePolicyID;
3078    Node clientNode = null;
3079    String clientMachine = null;
3080
3081    NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {}  inodeId {}" +
3082        " for {}", src, fileId, clientName);
3083
3084    checkOperation(OperationCategory.READ);
3085    FSPermissionChecker pc = getPermissionChecker();
3086    readLock();
3087    try {
3088      checkOperation(OperationCategory.READ);
3089      INodesInPath iip = dir.resolvePath(pc, src, fileId);
3090      src = iip.getPath();
3091      FileState fileState = analyzeFileState(
3092          iip, fileId, clientName, previous, onRetryBlock);
3093      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
3094        // This is a retry. No need to generate new locations.
3095        // Use the last block if it has locations.
3096        return null;
3097      }
3098
3099      final INodeFile pendingFile = fileState.inode;
3100      if (!checkFileProgress(src, pendingFile, false)) {
3101        throw new NotReplicatedYetException("Not replicated yet: " + src);
3102      }
3103      src = fileState.path;
3104
3105      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
3106        throw new IOException("File has reached the limit on maximum number of"
3107            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
3108            + "): " + pendingFile.getBlocks().length + " >= "
3109            + maxBlocksPerFile);
3110      }
3111      blockSize = pendingFile.getPreferredBlockSize();
3112      clientMachine = pendingFile.getFileUnderConstructionFeature()
3113          .getClientMachine();
3114      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
3115          clientMachine);
3116      replication = pendingFile.getFileReplication();
3117      storagePolicyID = pendingFile.getStoragePolicyID();
3118    } finally {
3119      readUnlock("getNewBlockTargets");
3120    }
3121
3122    if (clientNode == null) {
3123      clientNode = getClientNode(clientMachine);
3124    }
3125
3126    // choose targets for the new block to be allocated.
3127    return getBlockManager().chooseTarget4NewBlock( 
3128        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
3129        storagePolicyID);
3130  }
3131
3132  /**
3133   * Part II of getAdditionalBlock().
3134   * Should repeat the same analysis of the file state as in Part 1,
3135   * but under the write lock.
3136   * If the conditions still hold, then allocate a new block with
3137   * the new targets, add it to the INode and to the BlocksMap.
3138   */
3139  LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName,
3140      ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException {
3141    Block newBlock = null;
3142    long offset;
3143    checkOperation(OperationCategory.WRITE);
3144    waitForLoadingFSImage();
3145    writeLock();
3146    try {
3147      checkOperation(OperationCategory.WRITE);
3148      // Run the full analysis again, since things could have changed
3149      // while chooseTarget() was executing.
3150      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3151      final INodesInPath iip = dir.resolvePath(null, src, fileId);
3152      FileState fileState = 
3153          analyzeFileState(iip, fileId, clientName, previous, onRetryBlock);
3154      final INodeFile pendingFile = fileState.inode;
3155      src = fileState.path;
3156
3157      if (onRetryBlock[0] != null) {
3158        if (onRetryBlock[0].getLocations().length > 0) {
3159          // This is a retry. Just return the last block if having locations.
3160          return onRetryBlock[0];
3161        } else {
3162          // add new chosen targets to already allocated block and return
3163          BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
3164          ((BlockInfoContiguousUnderConstruction) lastBlockInFile)
3165              .setExpectedLocations(targets);
3166          offset = pendingFile.computeFileSize();
3167          return makeLocatedBlock(lastBlockInFile, targets, offset);
3168        }
3169      }
3170
3171      // commit the last block and complete it if it has minimum replicas
3172      commitOrCompleteLastBlock(pendingFile, fileState.iip,
3173                                ExtendedBlock.getLocalBlock(previous));
3174
3175      // allocate new block, record block locations in INode.
3176      newBlock = createNewBlock();
3177      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
3178      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
3179
3180      persistNewBlock(src, pendingFile);
3181      offset = pendingFile.computeFileSize();
3182    } finally {
3183      writeUnlock("storeAllocatedBlock");
3184    }
3185    getEditLog().logSync();
3186
3187    // Return located block
3188    return makeLocatedBlock(newBlock, targets, offset);
3189  }
3190
3191  /*
3192   * Resolve clientmachine address to get a network location path
3193   */
3194  private Node getClientNode(String clientMachine) {
3195    List<String> hosts = new ArrayList<String>(1);
3196    hosts.add(clientMachine);
3197    List<String> rName = getBlockManager().getDatanodeManager()
3198        .resolveNetworkLocation(hosts);
3199    Node clientNode = null;
3200    if (rName != null) {
3201      // Able to resolve clientMachine mapping.
3202      // Create a temp node to findout the rack local nodes
3203      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3204          + clientMachine);
3205    }
3206    return clientNode;
3207  }
3208
3209  static class FileState {
3210    public final INodeFile inode;
3211    public final String path;
3212    public final INodesInPath iip;
3213
3214    public FileState(INodeFile inode, String fullPath, INodesInPath iip) {
3215      this.inode = inode;
3216      this.path = fullPath;
3217      this.iip = iip;
3218    }
3219  }
3220
  /**
   * Validate that a new block may be allocated for the file at {@code iip}
   * and detect RPC retries. Must be called with at least the read lock held.
   *
   * Checks safe mode, fs-object limits and the caller's lease, then compares
   * the client's view of the last block ({@code previous}) with the
   * namesystem's view to classify the request (see the case analysis below).
   *
   * @param iip resolved path of the file being written to
   * @param fileId inode id of the file
   * @param clientName name of the client holding the lease
   * @param previous the client's view of the file's last block; may be null
   * @param onRetryBlock out-parameter: set to the existing last block when
   *        the request is recognized as a retry (case 2), otherwise null
   * @return the file's state (inode, resolved path, resolved components)
   * @throws IOException if the request is invalid (case 3), safe mode is on,
   *         limits are exceeded, or the lease check fails
   */
  private FileState analyzeFileState(
      INodesInPath iip, long fileId, String clientName,
      ExtendedBlock previous, LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();
    String src = iip.getPath();
    checkBlock(previous);
    onRetryBlock[0] = null;
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodeFile pendingFile = checkLease(iip, clientName, fileId);
    BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.allocateBlock: handling block allocation" +
            " writing to a file with a complete previous block: src={}" +
            " lastBlock={}", src, lastBlockInFile);
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src, iip);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }
    return new FileState(pendingFile, src, iip);
  }
3301
3302  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3303                                        long offset) throws IOException {
3304    LocatedBlock lBlk = new LocatedBlock(
3305        getExtendedBlock(blk), locs, offset, false);
3306    getBlockManager().setBlockToken(
3307        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3308    return lBlk;
3309  }
3310
  /**
   * Choose additional datanodes for an existing block's write pipeline,
   * e.g. to replace a failed datanode during a write.
   *
   * @param src path of the file being written to
   * @param fileId inode id of the file
   * @param blk the block whose pipeline needs more datanodes
   * @param existings datanodes already in the pipeline
   * @param storageIDs storage ids corresponding to {@code existings}
   * @param excludes datanodes to exclude from selection
   * @param numAdditionalNodes how many extra datanodes to choose
   * @param clientName name of the client holding the lease
   * @return the block with the newly chosen targets and a COPY-mode token
   * @see ClientProtocol#getAdditionalDatanode
   */
  LocatedBlock getAdditionalDatanode(String src, long fileId,
      final ExtendedBlock blk, final DatanodeInfo[] existings,
      final String[] storageIDs,
      final Set<Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    Node clientnode = null;
    String clientMachine;
    final long preferredblocksize;
    final byte storagePolicyID;
    final List<DatanodeStorageInfo> chosen;
    checkOperation(OperationCategory.READ);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      final INodesInPath iip = dir.resolvePath(pc, src, fileId);
      src = iip.getPath();

      //check lease
      final INodeFile file = checkLease(iip, clientName, fileId);
      // Capture the placement inputs under the read lock; target selection
      // itself happens after the lock is released.
      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
      preferredblocksize = file.getPreferredBlockSize();
      storagePolicyID = file.getStoragePolicyID();

      //find datanode storages
      final DatanodeManager dm = blockManager.getDatanodeManager();
      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs,
          "src=%s, fileId=%d, blk=%s, clientName=%s, clientMachine=%s",
          src, fileId, blk, clientName, clientMachine));
    } finally {
      readUnlock("getAdditionalDatanode");
    }

    // The client machine may not itself be a datanode; resolve its network
    // location so placement can still be rack-aware.
    if (clientnode == null) {
      clientnode = getClientNode(clientMachine);
    }

    // choose new datanodes.
    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
        src, numAdditionalNodes, clientnode, chosen, 
        excludes, preferredblocksize, storagePolicyID);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
3364
3365  /**
3366   * The client would like to let go of the given block
3367   */
3368  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3369      throws IOException {
3370    NameNode.stateChangeLog.debug(
3371        "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src);
3372    checkOperation(OperationCategory.WRITE);
3373    FSPermissionChecker pc = getPermissionChecker();
3374    waitForLoadingFSImage();
3375    writeLock();
3376    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
3377    src = iip.getPath();
3378    try {
3379      checkOperation(OperationCategory.WRITE);
3380      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3381      final INodeFile file = checkLease(iip, holder, fileId);
3382
3383      // Remove the block from the pending creates list
3384      boolean removed = dir.removeBlock(src, iip, file,
3385          ExtendedBlock.getLocalBlock(b));
3386      if (!removed) {
3387        return true;
3388      }
3389      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " +
3390          "removed from pendingCreates", b);
3391      persistBlocks(src, file, false);
3392    } finally {
3393      writeUnlock("abandonBlock");
3394    }
3395    getEditLog().logSync();
3396
3397    return true;
3398  }
3399
3400  private INodeFile checkLease(INodesInPath iip, String holder, long fileId)
3401      throws LeaseExpiredException, FileNotFoundException {
3402    String src = iip.getPath();
3403    INode inode = iip.getLastINode();
3404    assert hasReadLock();
3405    final String ident = src + " (inode " + fileId + ")";
3406    if (inode == null) {
3407      Lease lease = leaseManager.getLease(holder);
3408      throw new LeaseExpiredException(
3409          "No lease on " + ident + ": File does not exist. "
3410          + (lease != null ? lease.toString()
3411              : "Holder " + holder + " does not have any open files."));
3412    }
3413    if (!inode.isFile()) {
3414      Lease lease = leaseManager.getLease(holder);
3415      throw new LeaseExpiredException(
3416          "No lease on " + ident + ": INode is not a regular file. "
3417              + (lease != null ? lease.toString()
3418              : "Holder " + holder + " does not have any open files."));
3419    }
3420    final INodeFile file = inode.asFile();
3421    if (!file.isUnderConstruction()) {
3422      Lease lease = leaseManager.getLease(holder);
3423      throw new LeaseExpiredException(
3424          "No lease on " + ident + ": File is not open for writing. "
3425          + (lease != null ? lease.toString()
3426              : "Holder " + holder + " does not have any open files."));
3427    }
3428    // No further modification is allowed on a deleted file.
3429    // A file is considered deleted, if it is not in the inodeMap or is marked
3430    // as deleted in the snapshot feature.
3431    if (isFileDeleted(file)) {
3432      throw new FileNotFoundException(src);
3433    }
3434    String clientName = file.getFileUnderConstructionFeature().getClientName();
3435    if (holder != null && !clientName.equals(holder)) {
3436      throw new LeaseExpiredException("Lease mismatch on " + ident +
3437          " owned by " + clientName + " but is accessed by " + holder);
3438    }
3439    return file;
3440  }
3441 
3442  /**
3443   * Complete in-progress write to the given file.
3444   * @return true if successful, false if the client should continue to retry
3445   *         (e.g if not all blocks have reached minimum replication yet)
3446   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3447   */
3448  boolean completeFile(final String srcArg, String holder,
3449                       ExtendedBlock last, long fileId)
3450    throws SafeModeException, UnresolvedLinkException, IOException {
3451    String src = srcArg;
3452    NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}",
3453        src, holder);
3454    checkBlock(last);
3455    boolean success = false;
3456    checkOperation(OperationCategory.WRITE);
3457    waitForLoadingFSImage();
3458    writeLock();
3459    try {
3460      checkOperation(OperationCategory.WRITE);
3461      checkNameNodeSafeMode("Cannot complete file " + src);
3462      success = completeFileInternal(src, holder,
3463        ExtendedBlock.getLocalBlock(last), fileId);
3464    } finally {
3465      writeUnlock("completeFile");
3466    }
3467    getEditLog().logSync();
3468    if (success) {
3469      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3470          + " is closed by " + holder);
3471    }
3472    return success;
3473  }
3474
  /**
   * Do the work of {@code completeFile} under the write lock: validate the
   * lease, handle retried close RPCs (HDFS-3031), commit the last block and
   * finalize the file if all blocks are minimally replicated.
   *
   * @param src path of the file being closed
   * @param holder name of the client closing the file
   * @param last the client's view of the file's last block; may be null
   * @param fileId inode id of the file
   * @return true if the file was closed (or a retry of an already-completed
   *         close); false if the client should retry later
   * @throws IOException on lease mismatch or other errors
   */
  private boolean completeFileInternal(String src, String holder, Block last,
      long fileId) throws IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    FSPermissionChecker pc = getPermissionChecker();
    final INodesInPath iip = dir.resolvePath(pc, src, fileId);
    src = iip.getPath();
    INode inode = null;
    try {
      inode = iip.getLastINode();
      pendingFile = checkLease(iip, holder, fileId);
    } catch (LeaseExpiredException lee) {
      if (inode != null && inode.isFile() &&
          !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(src, pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, last);

    // All blocks (including the just-committed last one) must now be
    // minimally replicated before the file can be finalized.
    if (!checkFileProgress(src, pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3523
3524  /**
3525   * Save allocated block at the given pending filename
3526   * 
3527   * @param src path to the file
3528   * @param inodesInPath representing each of the components of src.
3529   *                     The last INode is the INode for {@code src} file.
3530   * @param newBlock newly allocated block to be save
3531   * @param targets target datanodes where replicas of the new block is placed
3532   * @throws QuotaExceededException If addition of block exceeds space quota
3533   */
3534  BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath,
3535      Block newBlock, DatanodeStorageInfo[] targets)
3536          throws IOException {
3537    assert hasWriteLock();
3538    BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets);
3539    NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src);
3540    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3541    return b;
3542  }
3543
3544  /**
3545   * Create new block with a unique block id and a new generation stamp.
3546   */
3547  Block createNewBlock() throws IOException {
3548    assert hasWriteLock();
3549    Block b = new Block(nextBlockId(), 0, 0);
3550    // Increment the generation stamp for every new block.
3551    b.setGenerationStamp(nextGenerationStamp(false));
3552    return b;
3553  }
3554
3555  /**
3556   * Check that the indicated file's blocks are present and
3557   * replicated.  If not, return false. If checkall is true, then check
3558   * all blocks, otherwise check only penultimate block.
3559   */
3560  boolean checkFileProgress(String src, INodeFile v, boolean checkall) {
3561    if (checkall) {
3562      // check all blocks of the file.
3563      for (BlockInfoContiguous block: v.getBlocks()) {
3564        if (!isCompleteBlock(src, block, blockManager.minReplication)) {
3565          return false;
3566        }
3567      }
3568    } else {
3569      // check the penultimate block of this file
3570      BlockInfoContiguous b = v.getPenultimateBlock();
3571      if (b != null
3572          && !isCompleteBlock(src, b, blockManager.minReplication)) {
3573        return false;
3574      }
3575    }
3576    return true;
3577  }
3578
3579  private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) {
3580    if (!b.isComplete()) {
3581      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b;
3582      final int numNodes = b.numNodes();
3583      LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = "
3584          + uc.getBlockUCState() + ", replication# = " + numNodes
3585          + (numNodes < minRepl? " < ": " >= ")
3586          + " minimum = " + minRepl + ") in file " + src);
3587      return false;
3588    }
3589    return true;
3590  }
3591
3592  ////////////////////////////////////////////////////////////////
3593  // Here's how to handle block-copy failure during client write:
3594  // -- As usual, the client's write should result in a streaming
3595  // backup write to a k-machine sequence.
3596  // -- If one of the backup machines fails, no worries.  Fail silently.
3597  // -- Before client is allowed to close and finalize file, make sure
3598  // that the blocks are backed up.  Namenode may have to issue specific backup
3599  // commands to make up for earlier datanode failures.  Once all copies
3600  // are made, edit namespace and return to client.
3601  ////////////////////////////////////////////////////////////////
3602
3603  /** 
3604   * Change the indicated filename. 
3605   * @deprecated Use {@link #renameTo(String, String, boolean,
3606   * Options.Rename...)} instead.
3607   */
3608  @Deprecated
3609  boolean renameTo(String src, String dst, boolean logRetryCache)
3610      throws IOException {
3611    final String operationName = "rename";
3612    waitForLoadingFSImage();
3613    FSDirRenameOp.RenameOldResult ret = null;
3614    writeLock();
3615    try {
3616      checkOperation(OperationCategory.WRITE);
3617      checkNameNodeSafeMode("Cannot rename " + src);
3618      ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
3619    } catch (AccessControlException e)  {
3620      logAuditEvent(false, operationName, src, dst, null);
3621      throw e;
3622    } finally {
3623      writeUnlock(operationName);
3624    }
3625    boolean success = ret != null && ret.success;
3626    if (success) {
3627      getEditLog().logSync();
3628    }
3629    logAuditEvent(success, "rename", src, dst,
3630        ret == null ? null : ret.auditStat);
3631    return success;
3632  }
3633
3634  void renameTo(final String src, final String dst,
3635                boolean logRetryCache, Options.Rename... options)
3636      throws IOException {
3637    final String operationName = "rename";
3638    waitForLoadingFSImage();
3639    Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
3640    writeLock();
3641    try {
3642      checkOperation(OperationCategory.WRITE);
3643      checkNameNodeSafeMode("Cannot rename " + src);
3644      res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
3645    } catch (AccessControlException e) {
3646      logAuditEvent(false, operationName + " (options=" +
3647          Arrays.toString(options) + ")", src, dst, null);
3648      throw e;
3649    } finally {
3650      writeUnlock(operationName);
3651    }
3652
3653    getEditLog().logSync();
3654
3655    BlocksMapUpdateInfo collectedBlocks = res.getKey();
3656    HdfsFileStatus auditStat = res.getValue();
3657    if (!collectedBlocks.getToDeleteList().isEmpty()) {
3658      removeBlocks(collectedBlocks);
3659      collectedBlocks.clear();
3660    }
3661
3662    logAuditEvent(true, operationName + " (options=" +
3663        Arrays.toString(options) + ")", src, dst, auditStat);
3664  }
3665
3666  /**
3667   * Remove the indicated file from namespace.
3668   * 
3669   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3670   * description of exceptions
3671   */
3672  boolean delete(String src, boolean recursive, boolean logRetryCache)
3673      throws IOException {
3674    waitForLoadingFSImage();
3675    final String operationName = "delete";
3676    BlocksMapUpdateInfo toRemovedBlocks = null;
3677    writeLock();
3678    boolean ret = false;
3679    try {
3680      checkOperation(OperationCategory.WRITE);
3681      checkNameNodeSafeMode("Cannot delete " + src);
3682      toRemovedBlocks = FSDirDeleteOp.delete(
3683          this, src, recursive, logRetryCache);
3684      ret = toRemovedBlocks != null;
3685    } catch (AccessControlException e) {
3686      logAuditEvent(false, operationName, src);
3687      throw e;
3688    } finally {
3689      writeUnlock(operationName);
3690    }
3691    getEditLog().logSync();
3692    if (toRemovedBlocks != null) {
3693      removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
3694    }
3695    logAuditEvent(true, operationName, src);
3696    return ret;
3697  }
3698
  /**
   * Obtain a permission checker for the current caller, delegating to the
   * directory tree.
   *
   * @throws AccessControlException if the checker cannot be constructed
   */
  FSPermissionChecker getPermissionChecker()
      throws AccessControlException {
    return dir.getPermissionChecker();
  }
3703
3704  /**
3705   * From the given list, incrementally remove the blocks from blockManager
3706   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3707   * ensure that other waiters on the lock can get in. See HDFS-2938
3708   * 
3709   * @param blocks
3710   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3711   *          of blocks that need to be removed from blocksMap
3712   */
3713  void removeBlocks(BlocksMapUpdateInfo blocks) {
3714    List<Block> toDeleteList = blocks.getToDeleteList();
3715    Iterator<Block> iter = toDeleteList.iterator();
3716    while (iter.hasNext()) {
3717      writeLock();
3718      try {
3719        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3720          blockManager.removeBlock(iter.next());
3721        }
3722      } finally {
3723        writeUnlock("removeBlocks");
3724      }
3725    }
3726  }
3727  
3728  /**
3729   * Remove leases and inodes related to a given path
3730   * @param src The given path
3731   * @param removedINodes Containing the list of inodes to be removed from
3732   *                      inodesMap
3733   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
3734   */
3735  void removeLeasesAndINodes(String src, List<INode> removedINodes,
3736      final boolean acquireINodeMapLock) {
3737    assert hasWriteLock();
3738    leaseManager.removeLeaseWithPrefixPath(src);
3739    // remove inodes from inodesMap
3740    if (removedINodes != null) {
3741      if (acquireINodeMapLock) {
3742        dir.writeLock();
3743      }
3744      try {
3745        dir.removeFromInodeMap(removedINodes);
3746      } finally {
3747        if (acquireINodeMapLock) {
3748          dir.writeUnlock();
3749        }
3750      }
3751      removedINodes.clear();
3752    }
3753  }
3754
3755  /**
3756   * Removes the blocks from blocksmap and updates the safemode blocks total
3757   * 
3758   * @param blocks
3759   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3760   *          of blocks that need to be removed from blocksMap
3761   */
3762  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3763    assert hasWriteLock();
3764    // In the case that we are a Standby tailing edits from the
3765    // active while in safe-mode, we need to track the total number
3766    // of blocks and safe blocks in the system.
3767    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3768    int numRemovedComplete = 0, numRemovedSafe = 0;
3769
3770    for (Block b : blocks.getToDeleteList()) {
3771      if (trackBlockCounts) {
3772        BlockInfoContiguous bi = getStoredBlock(b);
3773        if (bi.isComplete()) {
3774          numRemovedComplete++;
3775          if (bi.numNodes() >= blockManager.minReplication) {
3776            numRemovedSafe++;
3777          }
3778        }
3779      }
3780      blockManager.removeBlock(b);
3781    }
3782    if (trackBlockCounts) {
3783      if (LOG.isDebugEnabled()) {
3784        LOG.debug("Adjusting safe-mode totals for deletion."
3785            + "decreasing safeBlocks by " + numRemovedSafe
3786            + ", totalBlocks by " + numRemovedComplete);
3787      }
3788      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3789    }
3790  }
3791
3792  /**
3793   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3794   */
3795  private boolean isSafeModeTrackingBlocks() {
3796    if (!haEnabled) {
3797      // Never track blocks incrementally in non-HA code.
3798      return false;
3799    }
3800    SafeModeInfo sm = this.safeMode;
3801    return sm != null && sm.shouldIncrementallyTrackBlocks();
3802  }
3803
3804  /**
3805   * Get the file info for a specific file.
3806   *
3807   * @param src The string representation of the path to the file
3808   * @param resolveLink whether to throw UnresolvedLinkException
3809   *        if src refers to a symlink
3810   *
3811   * @throws AccessControlException if access is denied
3812   * @throws UnresolvedLinkException if a symlink is encountered.
3813   *
3814   * @return object containing information regarding the file
3815   *         or null if file not found
3816   * @throws StandbyException
3817   */
3818  HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
3819    throws IOException {
3820    final String operationName = "getfileinfo";
3821    checkOperation(OperationCategory.READ);
3822    HdfsFileStatus stat = null;
3823    readLock();
3824    try {
3825      checkOperation(OperationCategory.READ);
3826      stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
3827    } catch (AccessControlException e) {
3828      logAuditEvent(false, operationName, src);
3829      throw e;
3830    } finally {
3831      readUnlock(operationName);
3832    }
3833    logAuditEvent(true, operationName, src);
3834    return stat;
3835  }
3836
3837  /**
3838   * Returns true if the file is closed
3839   */
3840  boolean isFileClosed(final String src) throws IOException {
3841    final String operationName = "isFileClosed";
3842    checkOperation(OperationCategory.READ);
3843    readLock();
3844    try {
3845      checkOperation(OperationCategory.READ);
3846      return FSDirStatAndListingOp.isFileClosed(dir, src);
3847    } catch (AccessControlException e) {
3848      logAuditEvent(false, operationName, src);
3849      throw e;
3850    } finally {
3851      readUnlock(operationName);
3852    }
3853  }
3854
3855  /**
3856   * Create all the necessary directories
3857   */
3858  boolean mkdirs(String src, PermissionStatus permissions,
3859      boolean createParent) throws IOException {
3860    final String operationName = "mkdirs";
3861    HdfsFileStatus auditStat = null;
3862    checkOperation(OperationCategory.WRITE);
3863    writeLock();
3864    try {
3865      checkOperation(OperationCategory.WRITE);
3866      checkNameNodeSafeMode("Cannot create directory " + src);
3867      auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
3868    } catch (AccessControlException e) {
3869      logAuditEvent(false, operationName, src);
3870      throw e;
3871    } finally {
3872      writeUnlock(operationName);
3873    }
3874    getEditLog().logSync();
3875    logAuditEvent(true, operationName, src, null, auditStat);
3876    return true;
3877  }
3878
3879  /**
3880   * Get the content summary for a specific file/dir.
3881   *
3882   * @param src The string representation of the path to the file
3883   *
3884   * @throws AccessControlException if access is denied
3885   * @throws UnresolvedLinkException if a symlink is encountered.
3886   * @throws FileNotFoundException if no file exists
3887   * @throws StandbyException
3888   * @throws IOException for issues with writing to the audit log
3889   *
3890   * @return object containing information regarding the file
3891   *         or null if file not found
3892   */
3893  ContentSummary getContentSummary(final String src) throws IOException {
3894    checkOperation(OperationCategory.READ);
3895    final String operationName = "contentSummary";
3896    readLock();
3897    boolean success = true;
3898    try {
3899      checkOperation(OperationCategory.READ);
3900      return FSDirStatAndListingOp.getContentSummary(dir, src);
3901    } catch (AccessControlException ace) {
3902      success = false;
3903      throw ace;
3904    } finally {
3905      readUnlock(operationName);
3906      logAuditEvent(success, operationName, src);
3907    }
3908  }
3909
3910  /**
3911   * Set the namespace quota and storage space quota for a directory.
3912   * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
3913   * contract.
3914   * 
3915   * Note: This does not support ".inodes" relative path.
3916   */
3917  void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
3918      throws IOException {
3919    checkOperation(OperationCategory.WRITE);
3920    final String operationName = "setQuota";
3921    writeLock();
3922    boolean success = false;
3923    try {
3924      checkOperation(OperationCategory.WRITE);
3925      checkNameNodeSafeMode("Cannot set quota on " + src);
3926      FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
3927      success = true;
3928    } finally {
3929      writeUnlock(operationName);
3930      if (success) {
3931        getEditLog().logSync();
3932      }
3933      logAuditEvent(success, operationName, src);
3934    }
3935  }
3936
3937  /** Persist all metadata about this file.
3938   * @param src The string representation of the path
3939   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
3940   *               INodeId.GRANDFATHER_INODE_ID here.
3941   * @param clientName The string representation of the client
3942   * @param lastBlockLength The length of the last block 
3943   *                        under construction reported from client.
3944   * @throws IOException if path does not exist
3945   */
  void fsync(String src, long fileId, String clientName, long lastBlockLength)
      throws IOException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    checkOperation(OperationCategory.WRITE);

    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      // Resolve the path (preferring the inode id when the client supplied
      // one) and verify the caller still holds the lease on the file.
      INodesInPath iip = dir.resolvePath(pc, src, fileId);
      src = iip.getPath();
      final INodeFile pendingFile = checkLease(iip, clientName, fileId);
      if (lastBlockLength > 0) {
        // The client reported how many bytes of the last (under-construction)
        // block it has flushed; record that length on the file.
        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
            pendingFile, lastBlockLength);
      }
      // Log the current block list; the edit is synced after the lock drops.
      persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock("fsync");
    }
    // Sync the edit log outside the FSNamesystem write lock.
    getEditLog().logSync();
  }
3970
3971  /**
3972   * Move a file that is being written to be immutable.
3973   * @param src The filename
3974   * @param lease The lease for the client creating the file
3975   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3976   *        needs recovery; keep current holder if null.
3977   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3978   *         replication;<br>
3979   *         RecoveryInProgressException if lease recovery is in progress.<br>
3980   *         IOException in case of an error.
3981   * @return true  if file has been successfully finalized and closed or 
3982   *         false if block recovery has been initiated. Since the lease owner
3983   *         has been changed and logged, caller should call logSync().
3984   */
  boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
      String recoveryLeaseHolder) throws IOException {
    LOG.info("Recovering " + lease + ", src=" + src);
    assert !isInSafeMode();
    assert hasWriteLock();

    final INodeFile pendingFile = iip.getLastINode().asFile();
    int nrBlocks = pendingFile.numBlocks();
    BlockInfoContiguous[] blocks = pendingFile.getBlocks();

    // Count the leading run of COMPLETE blocks; after the loop, curBlock is
    // the first non-COMPLETE block (if any).
    int nrCompleteBlocks;
    BlockInfoContiguous curBlock = null;
    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
      curBlock = blocks[nrCompleteBlocks];
      if(!curBlock.isComplete())
        break;
      assert blockManager.checkMinReplication(curBlock) :
              "A COMPLETE block is not minimally replicated in " + src;
    }

    // If there are no incomplete blocks associated with this file,
    // then reap lease immediately and close the file.
    if(nrCompleteBlocks == nrBlocks) {
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshotId());
      NameNode.stateChangeLog.warn("BLOCK*"
        + " internalReleaseLease: All existing blocks are COMPLETE,"
        + " lease removed, file closed.");
      return true;  // closed!
    }

    // Only the last and the penultimate blocks may be in non COMPLETE state.
    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
    if(nrCompleteBlocks < nrBlocks - 2 ||
       nrCompleteBlocks == nrBlocks - 2 &&
         curBlock != null &&
         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
      final String message = "DIR* NameSystem.internalReleaseLease: "
        + "attempt to release a create lock on "
        + src + " but file is already closed.";
      NameNode.stateChangeLog.warn(message);
      throw new IOException(message);
    }

    // The last block is not COMPLETE, and
    // that the penultimate block if exists is either COMPLETE or COMMITTED
    final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
    BlockUCState lastBlockState = lastBlock.getBlockUCState();
    BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();

    // If penultimate block doesn't exist then its minReplication is met
    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
        blockManager.checkMinReplication(penultimateBlock);

    switch(lastBlockState) {
    case COMPLETE:
      // Unreachable: the earlier scan already found an incomplete block.
      assert false : "Already checked that the last block is incomplete";
      break;
    case COMMITTED:
      // Close file if committed blocks are minimally replicated
      if(penultimateBlockMinReplication &&
          blockManager.checkMinReplication(lastBlock)) {
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK*"
          + " internalReleaseLease: Committed blocks are minimally replicated,"
          + " lease removed, file closed.");
        return true;  // closed!
      }
      // Cannot close file right now, since some blocks 
      // are not yet minimally replicated.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      String message = "DIR* NameSystem.internalReleaseLease: " +
          "Failed to release lease for file " + src +
          ". Committed blocks are waiting to be minimally replicated." +
          " Try again later.";
      NameNode.stateChangeLog.warn(message);
      throw new AlreadyBeingCreatedException(message);
    case UNDER_CONSTRUCTION:
    case UNDER_RECOVERY:
      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock;
      // determine if last block was intended to be truncated
      Block recoveryBlock = uc.getTruncateBlock();
      boolean truncateRecovery = recoveryBlock != null;
      // copy-on-truncate: the truncated data lives in a new block with a
      // different id from the block under construction.
      boolean copyOnTruncate = truncateRecovery &&
          recoveryBlock.getBlockId() != uc.getBlockId();
      assert !copyOnTruncate ||
          recoveryBlock.getBlockId() < uc.getBlockId() &&
          recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
          recoveryBlock.getNumBytes() > uc.getNumBytes() :
            "wrong recoveryBlock";

      // setup the last block locations from the blockManager if not known
      if (uc.getNumExpectedLocations() == 0) {
        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
      }

      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
        // There is no datanode reported to this block.
        // may be client have crashed before writing data to pipeline.
        // This blocks doesn't need any recovery.
        // We can remove this block and close the file.
        pendingFile.removeLastBlock(lastBlock);
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
            + "Removed empty last block and closed file.");
        return true;
      }
      // start recovery of the last block for this file
      long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
      // Reassign the lease to the recovery holder (if given) so the original
      // writer can no longer mutate the file during recovery.
      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
      if(copyOnTruncate) {
        uc.setGenerationStamp(blockRecoveryId);
      } else if(truncateRecovery) {
        recoveryBlock.setGenerationStamp(blockRecoveryId);
      }
      uc.initializeBlockRecovery(blockRecoveryId);
      leaseManager.renewLease(lease);
      // Cannot close file right now, since the last block requires recovery.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      NameNode.stateChangeLog.warn(
                "DIR* NameSystem.internalReleaseLease: " +
                "File " + src + " has not been closed." +
               " Lease recovery is in progress. " +
                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
      break;
    }
    // false: recovery was initiated; caller must logSync() since the lease
    // reassignment above logged an unsynced transaction.
    return false;
  }
4117
4118  private Lease reassignLease(Lease lease, String src, String newHolder,
4119      INodeFile pendingFile) {
4120    assert hasWriteLock();
4121    if(newHolder == null)
4122      return lease;
4123    // The following transaction is not synced. Make sure it's sync'ed later.
4124    logReassignLease(lease.getHolder(), src, newHolder);
4125    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4126  }
4127  
4128  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
4129      INodeFile pendingFile) {
4130    assert hasWriteLock();
4131    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
4132    return leaseManager.reassignLease(lease, src, newHolder);
4133  }
4134
  // Commits the file's last block with the length/genstamp reported by the
  // client and completes it if it already has the minimum number of replicas.
  // The file must be under construction.
  private void commitOrCompleteLastBlock(final INodeFile fileINode,
      final INodesInPath iip, final Block commitBlock) throws IOException {
    assert hasWriteLock();
    Preconditions.checkArgument(fileINode.isUnderConstruction());
    blockManager.commitOrCompleteLastBlock(fileINode, commitBlock, iip);
  }
4141
  // Converts a file under construction into a finalized, immutable file:
  // drops the under-construction feature, releases the lease, logs a close
  // op, and asks the block manager to check replication.
  private void finalizeINodeFileUnderConstruction(String src,
      INodeFile pendingFile, int latestSnapshot) throws IOException {
    assert hasWriteLock();

    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
    if (uc == null) {
      throw new IOException("Cannot finalize file " + src
          + " because it is not under construction");
    }
    
    // Capture the file state in the latest snapshot before mutating it.
    pendingFile.recordModification(latestSnapshot);

    // The file is no longer pending.
    // Create permanent INode, update blocks. No need to replace the inode here
    // since we just remove the uc feature from pendingFile
    pendingFile.toCompleteFile(now());

    // Release the lease held by the (possibly reassigned) client.
    leaseManager.removeLease(uc.getClientName(), src);

    waitForLoadingFSImage();
    // close file and persist block allocations for this file
    closeFile(src, pendingFile);

    blockManager.checkReplication(pendingFile);
  }
4167
  // Looks up the block in the block manager's blocks map; returns the stored
  // BlockInfoContiguous, or null if the block is not (or no longer) tracked.
  @VisibleForTesting
  BlockInfoContiguous getStoredBlock(Block block) {
    return blockManager.getStoredBlock(block);
  }
4172  
4173  @Override
4174  public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
4175    assert hasReadLock();
4176    final BlockCollection bc = blockUC.getBlockCollection();
4177    if (bc == null || !(bc instanceof INodeFile)
4178        || !bc.isUnderConstruction()) {
4179      return false;
4180    }
4181
4182    String fullName = bc.getName();
4183    try {
4184      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4185          && dir.getINode(fullName) == bc) {
4186        // If file exists in normal path then no need to look in snapshot
4187        return false;
4188      }
4189    } catch (UnresolvedLinkException e) {
4190      LOG.error("Error while resolving the link : " + fullName, e);
4191      return false;
4192    }
4193    /*
4194     * 1. if bc is under construction and also with snapshot, and
4195     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4196     * file. 
4197     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4198     * current fsdirectory tree. 
4199     * 3. if bc is not the current node associated with fullName, bc must be a
4200     * snapshot inode.
4201     */
4202    return true;
4203  }
4204
  // Called by the primary datanode at the end of block recovery to report
  // the agreed-upon length/genstamp of the recovered block, update the
  // namespace accordingly, and optionally close the file.
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    final String src;
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      // Remember pre-recovery genstamp/length: if the file is being closed,
      // replicas still carrying these stale values are marked corrupt below.
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      src = iFile.getFullPathName();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + src + ", likely due to delayed block removal");
      }
      // A completed last block means this commit is a stale retry; ignore it.
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      // copyTruncate indicates copy-on-truncate recovery: the last block's id
      // differs from the recovered block's id, so both blocks exist.
      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      // Reject results from a superseded recovery attempt.
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode; targets that are no longer registered
            // are silently dropped (debug-logged) from the pipeline.
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]),
                "src=%s, oldBlock=%s, newgenerationstamp=%d, newlength=%d",
                src, oldBlock, newgenerationstamp, newlength);

        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas that still carry the pre-recovery genstamp/length are
            // now stale; mark them corrupt so they get invalidated.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          closeFileCommitBlocks(src, iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          closeFileCommitBlocks(src, iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock("commitBlockSynchronization");
    }
    // Sync the edit log outside the FSN write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }
4383
4384  /**
4385   * @param pendingFile open file that needs to be closed
4386   * @param storedBlock last block
4387   * @throws IOException on error
4388   */
4389  @VisibleForTesting
4390  void closeFileCommitBlocks(String src, INodeFile pendingFile,
4391      BlockInfoContiguous storedBlock) throws IOException {
4392    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
4393
4394    // commit the last block and complete it if it has minimum replicas
4395    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);
4396
4397    //remove lease, close file
4398    finalizeINodeFileUnderConstruction(src, pendingFile,
4399        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4400  }
4401
4402  /**
4403   * Renew the lease(s) held by the given client
4404   */
4405  void renewLease(String holder) throws IOException {
4406    checkOperation(OperationCategory.WRITE);
4407    readLock();
4408    try {
4409      checkOperation(OperationCategory.WRITE);
4410      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4411      leaseManager.renewLease(holder);
4412    } finally {
4413      readUnlock("renewLease");
4414    }
4415  }
4416
4417  /**
4418   * Get a partial listing of the indicated directory
4419   *
4420   * @param src the directory name
4421   * @param startAfter the name to start after
4422   * @param needLocation if blockLocations need to be returned
4423   * @return a partial listing starting after startAfter
4424   * 
4425   * @throws AccessControlException if access is denied
4426   * @throws UnresolvedLinkException if symbolic link is encountered
4427   * @throws IOException if other I/O error occurred
4428   */
4429  DirectoryListing getListing(String src, byte[] startAfter,
4430      boolean needLocation) 
4431      throws IOException {
4432    checkOperation(OperationCategory.READ);
4433    final String operationName = "listStatus";
4434    DirectoryListing dl = null;
4435    readLock();
4436    try {
4437      checkOperation(NameNode.OperationCategory.READ);
4438      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
4439          needLocation);
4440    } catch (AccessControlException e) {
4441      logAuditEvent(false, operationName, src);
4442      throw e;
4443    } finally {
4444      readUnlock(operationName);
4445    }
4446    logAuditEvent(true, operationName, src);
4447    return dl;
4448  }
4449
4450  /////////////////////////////////////////////////////////
4451  //
4452  // These methods are called by datanodes
4453  //
4454  /////////////////////////////////////////////////////////
4455  /**
4456   * Register Datanode.
4457   * <p>
4458   * The purpose of registration is to identify whether the new datanode
4459   * serves a new data storage, and will report new data block copies,
4460   * which the namenode was not aware of; or the datanode is a replacement
4461   * node for the data storage that was previously served by a different
4462   * or the same (in terms of host:port) datanode.
4463   * The data storages are distinguished by their storageIDs. When a new
4464   * data storage is reported the namenode issues a new unique storageID.
4465   * <p>
4466   * Finally, the namenode returns its namespaceID as the registrationID
4467   * for the datanodes. 
4468   * namespaceID is a persistent attribute of the name space.
4469   * The registrationID is checked every time the datanode is communicating
4470   * with the namenode. 
4471   * Datanodes with inappropriate registrationID are rejected.
4472   * If the namenode stops, and then restarts it can restore its 
4473   * namespaceID and will continue serving the datanodes that has previously
4474   * registered with the namenode without restarting the whole cluster.
4475   * 
4476   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4477   */
4478  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4479    writeLock();
4480    try {
4481      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4482      checkSafeMode();
4483    } finally {
4484      writeUnlock("registerDatanode");
4485    }
4486  }
4487  
4488  /**
4489   * Get registrationID for datanodes based on the namespaceID.
4490   * 
4491   * @see #registerDatanode(DatanodeRegistration)
4492   * @return registration ID
4493   */
4494  String getRegistrationID() {
4495    return Storage.getRegistrationID(getFSImage().getStorage());
4496  }
4497
4498  /**
4499   * The given node has reported in.  This method should:
4500   * 1) Record the heartbeat, so the datanode isn't timed out
4501   * 2) Adjust usage stats for future block allocation
4502   * 
4503   * If a substantial amount of time passed since the last datanode 
4504   * heartbeat then request an immediate block report.  
4505   * 
4506   * @return an array of datanode commands 
4507   * @throws IOException
4508   */
4509  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4510      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4511      int xceiverCount, int xmitsInProgress, int failedVolumes,
4512      VolumeFailureSummary volumeFailureSummary) throws IOException {
4513    readLock();
4514    try {
4515      //get datanode commands
4516      final int maxTransfer = blockManager.getMaxReplicationStreams()
4517          - xmitsInProgress;
4518      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4519          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4520          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
4521      
4522      //create ha status
4523      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4524          haContext.getState().getServiceState(),
4525          getFSImage().getCorrectLastAppliedOrWrittenTxId());
4526
4527      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4528    } finally {
4529      readUnlock("handleHeartbeat");
4530    }
4531  }
4532
4533  /**
4534   * Returns whether or not there were available resources at the last check of
4535   * resources.
4536   *
4537   * @return true if there were sufficient resources available, false otherwise.
4538   */
4539  boolean nameNodeHasResourcesAvailable() {
4540    return hasResourcesAvailable;
4541  }
4542
4543  /**
4544   * Perform resource checks and cache the results.
4545   */
4546  void checkAvailableResources() {
4547    Preconditions.checkState(nnResourceChecker != null,
4548        "nnResourceChecker not initialized");
4549    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4550  }
4551
4552  /**
4553   * Persist the block list for the inode.
4554   * @param path
4555   * @param file
4556   * @param logRetryCache
4557   */
4558  private void persistBlocks(String path, INodeFile file,
4559                             boolean logRetryCache) {
4560    assert hasWriteLock();
4561    Preconditions.checkArgument(file.isUnderConstruction());
4562    getEditLog().logUpdateBlocks(path, file, logRetryCache);
4563    NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" +
4564        " peristed to the file system", path, file.getBlocks().length);
4565  }
4566
4567  /**
4568   * Close file.
4569   * @param path
4570   * @param file
4571   */
4572  private void closeFile(String path, INodeFile file) {
4573    assert hasWriteLock();
4574    waitForLoadingFSImage();
4575    // file is closed
4576    getEditLog().logCloseFile(path, file);
4577    NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" +
4578        " to the file system", path, file.getBlocks().length);
4579  }
4580
4581  /**
4582   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4583   * there are found to be insufficient resources available, causes the NN to
4584   * enter safe mode. If resources are later found to have returned to
4585   * acceptable levels, this daemon will cause the NN to exit safe mode.
4586   */
4587  class NameNodeResourceMonitor implements Runnable  {
4588    boolean shouldNNRmRun = true;
4589    @Override
4590    public void run () {
4591      try {
4592        while (fsRunning && shouldNNRmRun) {
4593          checkAvailableResources();
4594          if(!nameNodeHasResourcesAvailable()) {
4595            String lowResourcesMsg = "NameNode low on available disk space. ";
4596            if (!isInSafeMode()) {
4597              LOG.warn(lowResourcesMsg + "Entering safe mode.");
4598            } else {
4599              LOG.warn(lowResourcesMsg + "Already in safe mode.");
4600            }
4601            enterSafeMode(true);
4602          }
4603          try {
4604            Thread.sleep(resourceRecheckInterval);
4605          } catch (InterruptedException ie) {
4606            // Deliberately ignore
4607          }
4608        }
4609      } catch (Exception e) {
4610        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4611      }
4612    }
4613
4614    public void stopMonitor() {
4615      shouldNNRmRun = false;
4616    }
4617 }
4618
4619  class NameNodeEditLogRoller implements Runnable {
4620
4621    private boolean shouldRun = true;
4622    private final long rollThreshold;
4623    private final long sleepIntervalMs;
4624
4625    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4626        this.rollThreshold = rollThreshold;
4627        this.sleepIntervalMs = sleepIntervalMs;
4628    }
4629
4630    @Override
4631    public void run() {
4632      while (fsRunning && shouldRun) {
4633        try {
4634          FSEditLog editLog = getFSImage().getEditLog();
4635          long numEdits =
4636              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4637          if (numEdits > rollThreshold) {
4638            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4639                + " number of edits in open segment exceeds threshold of "
4640                + rollThreshold);
4641            rollEditLog();
4642          }
4643        } catch (Exception e) {
4644          FSNamesystem.LOG.error("Swallowing exception in "
4645              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4646        }
4647        try {
4648          Thread.sleep(sleepIntervalMs);
4649        } catch (InterruptedException e) {
4650          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4651              + " was interrupted, exiting");
4652          break;
4653        }
4654      }
4655    }
4656
4657    public void stop() {
4658      shouldRun = false;
4659    }
4660  }
4661
4662  /**
4663   * Daemon to periodically scan the namespace for lazyPersist files
4664   * with missing blocks and unlink them.
4665   */
4666  class LazyPersistFileScrubber implements Runnable {
4667    private volatile boolean shouldRun = true;
4668    final int scrubIntervalSec;
4669    public LazyPersistFileScrubber(final int scrubIntervalSec) {
4670      this.scrubIntervalSec = scrubIntervalSec;
4671    }
4672
4673    /**
4674     * Periodically go over the list of lazyPersist files with missing
4675     * blocks and unlink them from the namespace.
4676     */
4677    private void clearCorruptLazyPersistFiles()
4678        throws IOException {
4679
4680      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
4681
4682      List<BlockCollection> filesToDelete = new ArrayList<>();
4683      boolean changed = false;
4684      writeLock();
4685      try {
4686        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
4687
4688        while (it.hasNext()) {
4689          Block b = it.next();
4690          BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b);
4691          if (blockInfo == null) {
4692            LOG.info("Cannot find block info for block " + b);
4693          } else {
4694            if (blockInfo.getBlockCollection().getStoragePolicyID()
4695                == lpPolicy.getId()) {
4696              filesToDelete.add(blockInfo.getBlockCollection());
4697            }
4698          }
4699        }
4700
4701        for (BlockCollection bc : filesToDelete) {
4702          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
4703          BlocksMapUpdateInfo toRemoveBlocks =
4704              FSDirDeleteOp.deleteInternal(
4705                  FSNamesystem.this, bc.getName(),
4706                  INodesInPath.fromINode((INodeFile) bc), false);
4707          changed |= toRemoveBlocks != null;
4708          if (toRemoveBlocks != null) {
4709            removeBlocks(toRemoveBlocks); // Incremental deletion of blocks
4710          }
4711        }
4712      } finally {
4713        writeUnlock("clearCorruptLazyPersistFiles");
4714      }
4715      if (changed) {
4716        getEditLog().logSync();
4717      }
4718    }
4719
4720    @Override
4721    public void run() {
4722      while (fsRunning && shouldRun) {
4723        try {
4724          clearCorruptLazyPersistFiles();
4725        } catch (Exception e) {
4726          FSNamesystem.LOG.error(
4727              "Ignoring exception in LazyPersistFileScrubber:", e);
4728        }
4729
4730        try {
4731          Thread.sleep(scrubIntervalSec * 1000);
4732        } catch (InterruptedException e) {
4733          FSNamesystem.LOG.info(
4734              "LazyPersistFileScrubber was interrupted, exiting");
4735          break;
4736        }
4737      }
4738    }
4739
4740    public void stop() {
4741      shouldRun = false;
4742    }
4743  }
4744
  /** @return the FSImage backing this namesystem. */
  public FSImage getFSImage() {
    return fsImage;
  }

  /** @return the edit log of the backing FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }    
4752
4753  private void checkBlock(ExtendedBlock block) throws IOException {
4754    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4755      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4756          + " - expected " + blockPoolId);
4757    }
4758  }
4759
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking
    return blockManager.getMissingBlocksCount();
  }

  @Metric({"MissingReplOneBlocks", "Number of missing blocks " +
      "with replication factor 1"})
  public long getMissingReplOneBlocksCount() {
    // not locking
    return blockManager.getMissingReplOneBlocksCount();
  }
  
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
  
  @Metric({"TransactionsSinceLastCheckpoint",
      "Number of transactions since last checkpoint"})
  public long getTransactionsSinceLastCheckpoint() {
    // The *WithoutLock variant avoids lock contention in the metrics path.
    return getEditLog().getLastWrittenTxIdWithoutLock() -
        getFSImage().getStorage().getMostRecentCheckpointTxId();
  }
  
  @Metric({"TransactionsSinceLastLogRoll",
      "Number of transactions since last edit log roll"})
  public long getTransactionsSinceLastLogRoll() {
    if (isInStandbyState() || !getEditLog().isSegmentOpenWithoutLock()) {
      // No open segment (or in standby): report zero.
      return 0;
    } else {
      // +1 because both endpoint transactions belong to the open segment.
      return getEditLog().getLastWrittenTxIdWithoutLock() -
          getEditLog().getCurSegmentTxIdWithoutLock() + 1;
    }
  }

  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxIdWithoutLock();
  }
  
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
4806
  /** @see ClientProtocol#getStats() */
  long[] getStats() {
    // Start from the datanode aggregate stats, then overlay the block-level
    // counters at their protocol-defined indices.
    final long[] stats = datanodeStatistics.getStats();
    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
    stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] =
        getMissingReplOneBlocksCount();
    return stats;
  }
4817
  // Cluster capacity metrics, aggregated from datanode statistics.
  // GB variants are rounded via DFSUtil.roundBytesToGB.

  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }

  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }

  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }

  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }

  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }

  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }

  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }

  /**
   * Total number of connections (datanode xceivers).
   */
  @Override // FSNamesystemMBean
  @Metric
  public int getTotalLoad() {
    return datanodeStatistics.getXceiverCount();
  }
  
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }

  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
4879
4880  @Override
4881  public String getSnapshotStats() {
4882    Map<String, Object> info = new HashMap<String, Object>();
4883    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4884    info.put("Snapshots", this.getNumSnapshots());
4885    return JSON.toString(info);
4886  }
4887
  @Override // FSNamesystemMBean
  @Metric({ "NumEncryptionZones", "The number of encryption zones" })
  public int getNumEncryptionZones() {
    // Delegates to the directory's encryption zone manager.
    return dir.ezManager.getNumEncryptionZones();
  }

  /**
   * Returns the length of the wait Queue for the FSNameSystemLock.
   *
   * A larger number here indicates lots of threads are waiting for
   * FSNameSystemLock.
   *
   * @return int - Number of Threads waiting to acquire FSNameSystemLock
   */
  @Override
  @Metric({"LockQueueLength", "Number of threads waiting to " +
      "acquire FSNameSystemLock"})
  public int getFsLockQueueLength() {
    return fsLock.getQueueLength();
  }
4908
  /**
   * Count the datanodes that would appear in a report of the given type.
   *
   * @param type which datanodes to include in the count
   * @return the number of matching datanodes
   */
  int getNumberOfDatanodes(DatanodeReportType type) {
    readLock();
    try {
      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
          type).size(); 
    } finally {
      readUnlock("getNumberOfDatanodes");
    }
  }
4918
4919  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4920      ) throws AccessControlException, StandbyException {
4921    checkSuperuserPrivilege();
4922    checkOperation(OperationCategory.UNCHECKED);
4923    readLock();
4924    try {
4925      checkOperation(OperationCategory.UNCHECKED);
4926      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4927      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4928
4929      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4930      for (int i=0; i<arr.length; i++) {
4931        arr[i] = new DatanodeInfo(results.get(i));
4932      }
4933      return arr;
4934    } finally {
4935      readUnlock("datanodeReport");
4936    }
4937  }
4938
  /**
   * Produce per-datanode storage reports for the datanodes matching the
   * given type. Requires superuser privilege.
   *
   * @param type which datanodes to include
   * @return one {@link DatanodeStorageReport} per selected datanode
   * @throws AccessControlException if the caller is not a superuser
   * @throws StandbyException if the operation cannot be served in the
   *           current HA state
   */
  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
      ) throws AccessControlException, StandbyException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    readLock();
    try {
      // Re-check under the lock in case the HA state changed.
      checkOperation(OperationCategory.UNCHECKED);
      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);

      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
      for (int i = 0; i < reports.length; i++) {
        final DatanodeDescriptor d = datanodes.get(i);
        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
            d.getStorageReports());
      }
      return reports;
    } finally {
      readUnlock("getDatanodeStorageReport");
    }
  }
4960
4961  /**
4962   * Save namespace image.
4963   * This will save current namespace into fsimage file and empty edits file.
4964   * Requires superuser privilege and safe mode.
4965   * 
4966   * @throws AccessControlException if superuser privilege is violated.
4967   * @throws IOException if 
4968   */
4969  void saveNamespace() throws AccessControlException, IOException {
4970    checkOperation(OperationCategory.UNCHECKED);
4971    checkSuperuserPrivilege();
4972
4973    cpLock();  // Block if a checkpointing is in progress on standby.
4974    readLock();
4975    try {
4976      checkOperation(OperationCategory.UNCHECKED);
4977
4978      if (!isInSafeMode()) {
4979        throw new IOException("Safe mode should be turned ON "
4980            + "in order to create namespace image.");
4981      }
4982      getFSImage().saveNamespace(this);
4983    } finally {
4984      readUnlock("saveNamespace");
4985      cpUnlock();
4986    }
4987    LOG.info("New namespace image has been created");
4988  }
4989  
4990  /**
4991   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4992   * Requires superuser privilege.
4993   * 
4994   * @throws AccessControlException if superuser privilege is violated.
4995   */
4996  boolean restoreFailedStorage(String arg) throws AccessControlException,
4997      StandbyException {
4998    checkSuperuserPrivilege();
4999    checkOperation(OperationCategory.UNCHECKED);
5000    cpLock();  // Block if a checkpointing is in progress on standby.
5001    writeLock();
5002    try {
5003      checkOperation(OperationCategory.UNCHECKED);
5004      
5005      // if it is disabled - enable it and vice versa.
5006      if(arg.equals("check"))
5007        return getFSImage().getStorage().getRestoreFailedStorage();
5008      
5009      boolean val = arg.equals("true");  // false if not
5010      getFSImage().getStorage().setRestoreFailedStorage(val);
5011      
5012      return val;
5013    } finally {
5014      writeUnlock("restoreFailedStorage");
5015      cpUnlock();
5016    }
5017  }
5018
  /** @return the time at which this namesystem was started. */
  Date getStartTime() {
    return new Date(startTime); 
  }
    
  /**
   * Finalize the ongoing upgrade of the on-disk storage.
   * Requires superuser privilege.
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    cpLock();  // Block if a checkpointing is in progress on standby.
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock("finalizeUpgrade");
      cpUnlock();
    }
  }

  /**
   * Refresh datanode membership. A fresh HdfsConfiguration is passed so
   * that on-disk configuration changes take effect.
   * Requires superuser privilege.
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }

  /**
   * Update the bandwidth limit used by the balancer on all datanodes.
   * Requires superuser privilege.
   *
   * @param bandwidth the new balancer bandwidth value
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
5048
5049  /**
5050   * Persist the new block (the last block of the given file).
5051   * @param path
5052   * @param file
5053   */
5054  private void persistNewBlock(String path, INodeFile file) {
5055    Preconditions.checkArgument(file.isUnderConstruction());
5056    getEditLog().logAddBlock(path, file);
5057    NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," +
5058        " current total block count is {}", path,
5059        file.getLastBlock().toString(), file.getBlocks().length);
5060  }
5061
5062  /**
5063   * SafeModeInfo contains information related to the safe mode.
5064   * <p>
5065   * An instance of {@link SafeModeInfo} is created when the name node
5066   * enters safe mode.
5067   * <p>
5068   * During name node startup {@link SafeModeInfo} counts the number of
5069   * <em>safe blocks</em>, those that have at least the minimal number of
5070   * replicas, and calculates the ratio of safe blocks to the total number
5071   * of blocks in the system, which is the size of blocks in
5072   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5073   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5074   * to monitor whether the safe mode {@link #extension} is passed.
5075   * Then it leaves safe mode and destroys itself.
5076   * <p>
5077   * If safe mode is turned on manually then the number of safe blocks is
5078   * not tracked because the name node is not intended to leave safe mode
5079   * automatically in the case.
5080   *
5081   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5082   */
  public class SafeModeInfo {
    // configuration fields
    /** Safe mode threshold condition %.*/
    private final double threshold;
    /** Safe mode minimum number of datanodes alive */
    private final int datanodeThreshold;
    /**
     * Safe mode extension after the threshold.
     * Make it volatile so that getSafeModeTip can read the latest value
     * without taking a lock.
     */
    private volatile int extension;
    /** Min replication required by safe mode. */
    private final int safeReplication;
    /** threshold for populating needed replication queues */
    private final double replQueueThreshold;
    // internal fields
    /** Time when threshold was reached.
     * <br> -1 safe mode is off
     * <br> 0 safe mode is on, and threshold is not reached yet
     * <br> >0 safe mode is on, but we are in extension period 
     */
    private long reached = -1;  
    // Wall-clock time corresponding to {@link #reached} (which is monotonic).
    private long reachedTimestamp = -1;
    /** Total number of blocks. */
    int blockTotal; 
    /** Number of safe blocks. */
    int blockSafe;
    /** Number of blocks needed to satisfy safe mode threshold condition */
    private int blockThreshold;
    /** Number of blocks needed before populating replication queues */
    private int blockReplQueueThreshold;
    /** time of the last status printout */
    private long lastStatusReport = 0;
    /**
     * Was safemode entered automatically because available resources were low.
     * Make it volatile so that getSafeModeTip can read the latest value
     * without taking a lock.
     */
    private volatile boolean resourcesLow = false;
    /** Should safemode adjust its block totals as blocks come in */
    private boolean shouldIncrementallyTrackBlocks = false;
    /** counter for tracking startup progress of reported blocks */
    private Counter awaitingReportedBlocksCounter;
    
5128    /**
5129     * Creates SafeModeInfo when the name node enters
5130     * automatic safe mode at startup.
5131     *  
5132     * @param conf configuration
5133     */
5134    private SafeModeInfo(Configuration conf) {
5135      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5136          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5137      if(threshold > 1.0) {
5138        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5139      }
5140      this.datanodeThreshold = conf.getInt(
5141        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5142        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5143      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5144      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5145                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5146      
5147      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5148      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5149      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5150
5151      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5152      this.replQueueThreshold = 
5153        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5154                      (float) threshold);
5155      this.blockTotal = 0; 
5156      this.blockSafe = 0;
5157    }
5158
5159    /**
5160     * In the HA case, the StandbyNode can be in safemode while the namespace
5161     * is modified by the edit log tailer. In this case, the number of total
5162     * blocks changes as edits are processed (eg blocks are added and deleted).
5163     * However, we don't want to do the incremental tracking during the
5164     * startup-time loading process -- only once the initial total has been
5165     * set after the image has been loaded.
5166     */
5167    private boolean shouldIncrementallyTrackBlocks() {
5168      return shouldIncrementallyTrackBlocks;
5169    }
5170
5171    /**
5172     * Creates SafeModeInfo when safe mode is entered manually, or because
5173     * available resources are low.
5174     *
5175     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5176     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5177     * 
5178     * @see SafeModeInfo
5179     */
5180    private SafeModeInfo(boolean resourcesLow) {
5181      this.threshold = 1.5f;  // this threshold can never be reached
5182      this.datanodeThreshold = Integer.MAX_VALUE;
5183      this.extension = Integer.MAX_VALUE;
5184      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
5185      this.replQueueThreshold = 1.5f; // can never be reached
5186      this.blockTotal = -1;
5187      this.blockSafe = -1;
5188      this.resourcesLow = resourcesLow;
5189      enter();
5190      reportStatus("STATE* Safe mode is ON.", true);
5191    }
5192      
5193    /**
5194     * Check if safe mode is on.
5195     * @return true if in safe mode
5196     */
5197    private synchronized boolean isOn() {
5198      doConsistencyCheck();
5199      return this.reached >= 0;
5200    }
5201      
5202    /**
5203     * Enter safe mode.
5204     */
5205    private void enter() {
5206      this.reached = 0;
5207      this.reachedTimestamp = 0;
5208    }
5209      
5210    /**
5211     * Leave safe mode.
5212     * <p>
5213     * Check for invalid, under- & over-replicated blocks in the end of startup.
5214     */
5215    private synchronized void leave() {
5216      // if not done yet, initialize replication queues.
5217      // In the standby, do not populate repl queues
5218      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
5219        initializeReplQueues();
5220      }
5221      long timeInSafemode = now() - startTime;
5222      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
5223                                    + timeInSafemode/1000 + " secs");
5224      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
5225
5226      //Log the following only once (when transitioning from ON -> OFF)
5227      if (reached >= 0) {
5228        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
5229      }
5230      reached = -1;
5231      reachedTimestamp = -1;
5232      safeMode = null;
5233      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
5234      NameNode.stateChangeLog.info("STATE* Network topology has "
5235          + nt.getNumOfRacks() + " racks and "
5236          + nt.getNumOfLeaves() + " datanodes");
5237      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
5238          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
5239
5240      startSecretManagerIfNecessary();
5241
5242      // If startup has not yet completed, end safemode phase.
5243      StartupProgress prog = NameNode.getStartupProgress();
5244      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5245        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
5246        prog.endPhase(Phase.SAFEMODE);
5247      }
5248    }
5249
5250    /**
5251     * Check whether we have reached the threshold for 
5252     * initializing replication queues.
5253     */
5254    private synchronized boolean canInitializeReplQueues() {
5255      return shouldPopulateReplQueues()
5256          && blockSafe >= blockReplQueueThreshold;
5257    }
5258      
5259    /** 
5260     * Safe mode can be turned off iff 
5261     * the threshold is reached and 
5262     * the extension time have passed.
5263     * @return true if can leave or false otherwise.
5264     */
5265    private synchronized boolean canLeave() {
5266      if (reached == 0) {
5267        return false;
5268      }
5269
5270      if (monotonicNow() - reached < extension) {
5271        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
5272        return false;
5273      }
5274
5275      if (needEnter()) {
5276        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
5277        return false;
5278      }
5279
5280      return true;
5281    }
5282      
5283    /** 
5284     * There is no need to enter safe mode 
5285     * if DFS is empty or {@link #threshold} == 0
5286     */
5287    private boolean needEnter() {
5288      return (threshold != 0 && blockSafe < blockThreshold) ||
5289        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5290        (!nameNodeHasResourcesAvailable());
5291    }
5292      
5293    /**
5294     * Check and trigger safe mode if needed. 
5295     */
5296    private void checkMode() {
5297      // Have to have write-lock since leaving safemode initializes
5298      // repl queues, which requires write lock
5299      assert hasWriteLock();
5300      if (inTransitionToActive()) {
5301        return;
5302      }
5303      // if smmthread is already running, the block threshold must have been 
5304      // reached before, there is no need to enter the safe mode again
5305      if (smmthread == null && needEnter()) {
5306        enter();
5307        // check if we are ready to initialize replication queues
5308        if (canInitializeReplQueues() && !isPopulatingReplQueues()
5309            && !haEnabled) {
5310          initializeReplQueues();
5311        }
5312        reportStatus("STATE* Safe mode ON.", false);
5313        return;
5314      }
5315      // the threshold is reached or was reached before
5316      if (!isOn() ||                           // safe mode is off
5317          extension <= 0 || threshold <= 0) {  // don't need to wait
5318        this.leave(); // leave safe mode
5319        return;
5320      }
5321      if (reached > 0) {  // threshold has already been reached before
5322        reportStatus("STATE* Safe mode ON.", false);
5323        return;
5324      }
5325      // start monitor
5326      reached = monotonicNow();
5327      reachedTimestamp = now();
5328      if (smmthread == null) {
5329        smmthread = new Daemon(new SafeModeMonitor());
5330        smmthread.start();
5331        reportStatus("STATE* Safe mode extension entered.", true);
5332      }
5333
5334      // check if we are ready to initialize replication queues
5335      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
5336        initializeReplQueues();
5337      }
5338    }
5339      
5340    /**
5341     * Set total number of blocks.
5342     */
5343    private synchronized void setBlockTotal(int total) {
5344      this.blockTotal = total;
5345      this.blockThreshold = (int) (blockTotal * threshold);
5346      this.blockReplQueueThreshold = 
5347        (int) (blockTotal * replQueueThreshold);
5348      if (haEnabled) {
5349        // After we initialize the block count, any further namespace
5350        // modifications done while in safe mode need to keep track
5351        // of the number of total blocks in the system.
5352        this.shouldIncrementallyTrackBlocks = true;
5353      }
5354      if(blockSafe < 0)
5355        this.blockSafe = 0;
5356      checkMode();
5357    }
5358      
5359    /**
5360     * Increment number of safe blocks if current block has 
5361     * reached minimal replication.
5362     * @param replication current replication 
5363     */
5364    private synchronized void incrementSafeBlockCount(short replication) {
5365      if (replication == safeReplication) {
5366        this.blockSafe++;
5367
5368        // Report startup progress only if we haven't completed startup yet.
5369        StartupProgress prog = NameNode.getStartupProgress();
5370        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5371          if (this.awaitingReportedBlocksCounter == null) {
5372            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
5373              STEP_AWAITING_REPORTED_BLOCKS);
5374          }
5375          this.awaitingReportedBlocksCounter.increment();
5376        }
5377
5378        checkMode();
5379      }
5380    }
5381      
5382    /**
5383     * Decrement number of safe blocks if current block has 
5384     * fallen below minimal replication.
5385     * @param replication current replication 
5386     */
5387    private synchronized void decrementSafeBlockCount(short replication) {
5388      if (replication == safeReplication-1) {
5389        this.blockSafe--;
5390        //blockSafe is set to -1 in manual / low resources safemode
5391        assert blockSafe >= 0 || isManual() || areResourcesLow();
5392        checkMode();
5393      }
5394    }
5395
5396    /**
5397     * Check if safe mode was entered manually
5398     */
5399    private boolean isManual() {
5400      return extension == Integer.MAX_VALUE;
5401    }
5402
5403    /**
5404     * Set manual safe mode.
5405     */
5406    private synchronized void setManual() {
5407      extension = Integer.MAX_VALUE;
5408    }
5409
5410    /**
5411     * Check if safe mode was entered due to resources being low.
5412     */
5413    private boolean areResourcesLow() {
5414      return resourcesLow;
5415    }
5416
5417    /**
5418     * Set that resources are low for this instance of safe mode.
5419     */
5420    private void setResourcesLow() {
5421      resourcesLow = true;
5422    }
5423
5424    /**
5425     * A tip on how safe mode is to be turned off: manually or automatically.
5426     */
5427    String getTurnOffTip() {
5428      if(!isOn()) {
5429        return "Safe mode is OFF.";
5430      }
5431
5432      //Manual OR low-resource safemode. (Admin intervention required)
5433      String adminMsg = "It was turned on manually. ";
5434      if (areResourcesLow()) {
5435        adminMsg = "Resources are low on NN. Please add or free up more "
5436          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5437          + " safe mode before adding resources, "
5438          + "the NN will immediately return to safe mode. ";
5439      }
5440      if (isManual() || areResourcesLow()) {
5441        return adminMsg
5442          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5443      }
5444
5445      boolean thresholdsMet = true;
5446      int numLive = getNumLiveDataNodes();
5447      String msg = "";
5448      if (blockSafe < blockThreshold) {
5449        msg += String.format(
5450          "The reported blocks %d needs additional %d"
5451          + " blocks to reach the threshold %.4f of total blocks %d.%n",
5452          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5453        thresholdsMet = false;
5454      } else {
5455        msg += String.format("The reported blocks %d has reached the threshold"
5456            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5457      }
5458      if (numLive < datanodeThreshold) {
5459        msg += String.format(
5460          "The number of live datanodes %d needs an additional %d live "
5461          + "datanodes to reach the minimum number %d.%n",
5462          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5463        thresholdsMet = false;
5464      } else {
5465        msg += String.format("The number of live datanodes %d has reached "
5466            + "the minimum number %d. ",
5467            numLive, datanodeThreshold);
5468      }
5469      msg += (reached > 0) ? "In safe mode extension. " : "";
5470      msg += "Safe mode will be turned off automatically ";
5471
5472      if (!thresholdsMet) {
5473        msg += "once the thresholds have been reached.";
5474      } else if (reached + extension - monotonicNow() > 0) {
5475        msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds.");
5476      } else {
5477        msg += "soon.";
5478      }
5479
5480      return msg;
5481    }
5482
5483    /**
5484     * Print status every 20 seconds.
5485     */
5486    private void reportStatus(String msg, boolean rightNow) {
5487      long curTime = now();
5488      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5489        return;
5490      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5491      lastStatusReport = curTime;
5492    }
5493
    /**
     * Human-readable summary of the safe mode counters for logging.
     */
    @Override
    public String toString() {
      String resText = "Current safe blocks = " 
        + blockSafe 
        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
        + ". Minimal replication = " + safeReplication + ".";
      // reached > 0 means the threshold has been met; reachedTimestamp is
      // wall-clock time, suitable for Date formatting.
      if (reached > 0) 
        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
      return resText;
    }
5504      
5505    /**
5506     * Checks consistency of the class state.
5507     * This is costly so only runs if asserts are enabled.
5508     */
5509    private void doConsistencyCheck() {
5510      boolean assertsOn = false;
5511      assert assertsOn = true; // set to true if asserts are on
5512      if (!assertsOn) return;
5513      
5514      if (blockTotal == -1 && blockSafe == -1) {
5515        return; // manual safe mode
5516      }
5517      int activeBlocks = blockManager.getActiveBlockCount();
5518      if ((blockTotal != activeBlocks) &&
5519          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5520        throw new AssertionError(
5521            " SafeMode: Inconsistent filesystem state: "
5522        + "SafeMode data: blockTotal=" + blockTotal
5523        + " blockSafe=" + blockSafe + "; "
5524        + "BlockManager data: active="  + activeBlocks);
5525      }
5526    }
5527
    /**
     * Adjust the safe and total block counts while namespace edits are made
     * in HA safe mode. No-op until {@link #setBlockTotal(int)} has enabled
     * incremental tracking (only done when HA is enabled).
     *
     * @param deltaSafe change in the number of safe blocks
     * @param deltaTotal change in the number of expected total blocks
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      assert haEnabled;
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      // Validate before mutating so a failed assert leaves state untouched.
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";
      
      blockSafe += deltaSafe;
      // setBlockTotal recomputes thresholds and re-runs checkMode().
      setBlockTotal(blockTotal + deltaTotal);
    }
5547  }
5548    
5549  /**
5550   * Periodically check whether it is time to leave safe mode.
5551   * This thread starts when the threshold level is reached.
5552   *
5553   */
5554  class SafeModeMonitor implements Runnable {
5555    /** interval in msec for checking safe mode: {@value} */
5556    private static final long recheckInterval = 1000;
5557      
5558    /**
5559     */
5560    @Override
5561    public void run() {
5562      while (fsRunning) {
5563        writeLock();
5564        try {
5565          if (safeMode == null) { // Not in safe mode.
5566            break;
5567          }
5568          if (safeMode.canLeave()) {
5569            // Leave safe mode.
5570            safeMode.leave();
5571            smmthread = null;
5572            break;
5573          }
5574        } finally {
5575          writeUnlock();
5576        }
5577
5578        try {
5579          Thread.sleep(recheckInterval);
5580        } catch (InterruptedException ie) {
5581          // Ignored
5582        }
5583      }
5584      if (!fsRunning) {
5585        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5586      }
5587    }
5588  }
5589    
5590  boolean setSafeMode(SafeModeAction action) throws IOException {
5591    if (action != SafeModeAction.SAFEMODE_GET) {
5592      checkSuperuserPrivilege();
5593      switch(action) {
5594      case SAFEMODE_LEAVE: // leave safe mode
5595        leaveSafeMode();
5596        break;
5597      case SAFEMODE_ENTER: // enter safe mode
5598        enterSafeMode(false);
5599        break;
5600      default:
5601        LOG.error("Unexpected safe mode action");
5602      }
5603    }
5604    return isInSafeMode();
5605  }
5606
5607  @Override
5608  public void checkSafeMode() {
5609    // safeMode is volatile, and may be set to null at any time
5610    SafeModeInfo safeMode = this.safeMode;
5611    if (safeMode != null) {
5612      safeMode.checkMode();
5613    }
5614  }
5615
5616  @Override
5617  public boolean isInSafeMode() {
5618    // safeMode is volatile, and may be set to null at any time
5619    SafeModeInfo safeMode = this.safeMode;
5620    if (safeMode == null)
5621      return false;
5622    return safeMode.isOn();
5623  }
5624
5625  @Override
5626  public boolean isInStartupSafeMode() {
5627    // safeMode is volatile, and may be set to null at any time
5628    SafeModeInfo safeMode = this.safeMode;
5629    if (safeMode == null)
5630      return false;
5631    // If the NN is in safemode, and not due to manual / low resources, we
5632    // assume it must be because of startup. If the NN had low resources during
5633    // startup, we assume it came out of startup safemode and it is now in low
5634    // resources safemode
5635    return !safeMode.isManual() && !safeMode.areResourcesLow()
5636      && safeMode.isOn();
5637  }
5638
5639  /**
5640   * Check if replication queues are to be populated
5641   * @return true when node is HAState.Active and not in the very first safemode
5642   */
5643  @Override
5644  public boolean isPopulatingReplQueues() {
5645    if (!shouldPopulateReplQueues()) {
5646      return false;
5647    }
5648    return initializedReplQueues;
5649  }
5650
5651  private boolean shouldPopulateReplQueues() {
5652    if(haContext == null || haContext.getState() == null)
5653      return false;
5654    return haContext.getState().shouldPopulateReplQueues();
5655  }
5656
5657  @Override
5658  public void incrementSafeBlockCount(int replication) {
5659    // safeMode is volatile, and may be set to null at any time
5660    SafeModeInfo safeMode = this.safeMode;
5661    if (safeMode == null)
5662      return;
5663    safeMode.incrementSafeBlockCount((short)replication);
5664  }
5665
5666  @Override
5667  public void decrementSafeBlockCount(Block b) {
5668    // safeMode is volatile, and may be set to null at any time
5669    SafeModeInfo safeMode = this.safeMode;
5670    if (safeMode == null) // mostly true
5671      return;
5672    BlockInfoContiguous storedBlock = getStoredBlock(b);
5673    if (storedBlock.isComplete()) {
5674      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5675    }
5676  }
5677  
5678  /**
5679   * Adjust the total number of blocks safe and expected during safe mode.
5680   * If safe mode is not currently on, this is a no-op.
5681   * @param deltaSafe the change in number of safe blocks
5682   * @param deltaTotal the change i nnumber of total blocks expected
5683   */
5684  @Override
5685  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5686    // safeMode is volatile, and may be set to null at any time
5687    SafeModeInfo safeMode = this.safeMode;
5688    if (safeMode == null)
5689      return;
5690    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5691  }
5692
5693  /**
5694   * Set the total number of blocks in the system. 
5695   */
5696  public void setBlockTotal() {
5697    // safeMode is volatile, and may be set to null at any time
5698    SafeModeInfo safeMode = this.safeMode;
5699    if (safeMode == null)
5700      return;
5701    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5702  }
5703
5704  /**
5705   * Get the total number of blocks in the system. 
5706   */
5707  @Override // FSNamesystemMBean
5708  @Metric
5709  public long getBlocksTotal() {
5710    return blockManager.getTotalBlocks();
5711  }
5712
5713  /**
5714   * Get the total number of COMPLETE blocks in the system.
5715   * For safe mode only complete blocks are counted.
5716   */
5717  private long getCompleteBlocksTotal() {
5718    // Calculate number of blocks under construction
5719    long numUCBlocks = 0;
5720    readLock();
5721    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
5722    try {
5723      return getBlocksTotal() - numUCBlocks;
5724    } finally {
5725      readUnlock("getCompleteBlocksTotal");
5726    }
5727  }
5728
5729  /**
5730   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5731   * @throws IOException
5732   */
5733  void enterSafeMode(boolean resourcesLow) throws IOException {
5734    writeLock();
5735    try {
5736      // Stop the secret manager, since rolling the master key would
5737      // try to write to the edit log
5738      stopSecretManager();
5739
5740      // Ensure that any concurrent operations have been fully synced
5741      // before entering safe mode. This ensures that the FSImage
5742      // is entirely stable on disk as soon as we're in safe mode.
5743      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5744      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5745      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5746      if (isEditlogOpenForWrite) {
5747        getEditLog().logSyncAll();
5748      }
5749      if (!isInSafeMode()) {
5750        safeMode = new SafeModeInfo(resourcesLow);
5751        return;
5752      }
5753      if (resourcesLow) {
5754        safeMode.setResourcesLow();
5755      } else {
5756        safeMode.setManual();
5757      }
5758      if (isEditlogOpenForWrite) {
5759        getEditLog().logSyncAll();
5760      }
5761      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5762          + safeMode.getTurnOffTip());
5763    } finally {
5764      writeUnlock("enterSafeMode");
5765    }
5766  }
5767
5768  /**
5769   * Leave safe mode.
5770   */
5771  void leaveSafeMode() {
5772    writeLock();
5773    try {
5774      if (!isInSafeMode()) {
5775        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5776        return;
5777      }
5778      safeMode.leave();
5779    } finally {
5780      writeUnlock("leaveSafeMode");
5781    }
5782  }
5783    
5784  String getSafeModeTip() {
5785    // There is no need to take readLock.
5786    // Don't use isInSafeMode as this.safeMode might be set to null.
5787    // after isInSafeMode returns.
5788    boolean inSafeMode;
5789    SafeModeInfo safeMode = this.safeMode;
5790    if (safeMode == null) {
5791      inSafeMode = false;
5792    } else {
5793      inSafeMode = safeMode.isOn();
5794    }
5795
5796    if (!inSafeMode) {
5797      return "";
5798    } else {
5799      return safeMode.getTurnOffTip();
5800    }
5801  }
5802
  /**
   * Roll the edit log under the write lock.
   * Requires superuser privilege and a journal-capable NN state; refused in
   * safe mode.
   *
   * @return signature of the checkpoint state after the roll
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check after acquiring the lock: the HA state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock("rollEditLog");
    }
  }
5818
  /**
   * Start a checkpoint on behalf of a backup node.
   * Refused in safe mode; the resulting command is durably logged before
   * returning.
   *
   * @param backupNode registration of the node performing the checkpoint
   * @param activeNamenode registration of the active NN
   * @return command telling the backup node how to proceed
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      // Re-check after acquiring the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock("startCheckpoint");
    }
  }
5836
  /**
   * Forward an incremental block report from a datanode to the block
   * manager, under the namesystem write lock.
   *
   * @param nodeID the reporting datanode
   * @param srdb received/deleted blocks for one storage on that node
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock("processIncrementalBlockReport");
    }
  }
5847  
  /**
   * Finish a checkpoint started by a backup node.
   * Only the read lock is needed: ending a checkpoint does not modify the
   * namespace. Refused in safe mode.
   *
   * @param registration the checkpointing node
   * @param sig signature returned by the matching startCheckpoint call
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      // Re-check after acquiring the lock: the HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock("endCheckpoint");
    }
  }
5861
  /**
   * Build a PermissionStatus owned by the NN's fsOwner and supergroup with
   * the given mode bits.
   */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
  }
5865
5866  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
5867      INode inode, int snapshotId)
5868      throws IOException {
5869    if (pc.isSuperUser()) {
5870      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
5871        if (XAttrHelper.getPrefixName(xattr).
5872            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
5873          throw new AccessControlException("Access is denied for " +
5874              pc.getUser() + " since the superuser is not allowed to " +
5875              "perform this operation.");
5876        }
5877      }
5878    }
5879  }
5880
5881  @Override
5882  public void checkSuperuserPrivilege()
5883      throws AccessControlException {
5884    if (isPermissionEnabled) {
5885      FSPermissionChecker pc = getPermissionChecker();
5886      pc.checkSuperuserPrivilege();
5887    }
5888  }
5889
5890  /**
5891   * Check to see if we have exceeded the limit on the number
5892   * of inodes.
5893   */
5894  void checkFsObjectLimit() throws IOException {
5895    if (maxFsObjects != 0 &&
5896        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5897      throw new IOException("Exceeded the configured number of objects " +
5898                             maxFsObjects + " in the filesystem.");
5899    }
5900  }
5901
5902  /**
5903   * Get the total number of objects in the system. 
5904   */
5905  @Override // FSNamesystemMBean
5906  public long getMaxObjects() {
5907    return maxFsObjects;
5908  }
5909
  /** Total number of inodes in the namespace. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    // There is no need to take fSNamesystem's lock as
    // FSDirectory has its own lock.
    return this.dir.totalInodes();
  }
5917
  /** Number of blocks with replication currently in flight (per BlockManager). */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
5923
  /** Number of under-replicated blocks (per BlockManager). */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
5929
  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }
5935
  /** Number of blocks scheduled for replication (per BlockManager). */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
5941
  /** Number of blocks queued for deletion (per BlockManager). */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
5947
  /**
   * Time at which block deletion begins: namesystem start time plus the
   * configured startup deletion delay.
   */
  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }
5952
  /** Number of over-replicated (excess) block replicas (per BlockManager). */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
5957  
  // HA-only metric: mis-replicated blocks whose processing is postponed
  // (per BlockManager).
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
5963
  // HA-only metric: queued datanode messages (per BlockManager).
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
5969  
  // HA-only metric: string form of the current HA state.
  // NOTE(review): unlike the other getters this dereferences haContext
  // without a null check — confirm it is only exported once HA is set up.
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5975
5976  // HA-only metric
5977  @Metric
5978  public long getMillisSinceLastLoadedEdits() {
5979    if (isInStandbyState() && editLogTailer != null) {
5980      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
5981    } else {
5982      return 0;
5983    }
5984  }
5985  
  /** Capacity of the block map (per BlockManager). */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5990
5991  @Override // FSNamesystemMBean
5992  public String getFSState() {
5993    return isInSafeMode() ? "safeMode" : "Operational";
5994  }
5995  
  // JMX name of the FSNamesystemState MBean; null until registerMBean() runs
  // and again after shutdown() unregisters it.
  private ObjectName mbeanName;
  // JMX name of the MXBean; null when unregistered (registration happens
  // outside this section — see shutdown()).
  private ObjectName mxbeanName;
5998
5999  /**
6000   * Register the FSNamesystem MBean using the name
6001   *        "hadoop:service=NameNode,name=FSNamesystemState"
6002   */
6003  private void registerMBean() {
6004    // We can only implement one MXBean interface, so we keep the old one.
6005    try {
6006      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
6007      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
6008    } catch (NotCompliantMBeanException e) {
6009      throw new RuntimeException("Bad MBean setup", e);
6010    }
6011
6012    LOG.info("Registered FSNamesystemState MBean");
6013  }
6014
6015  /**
6016   * shutdown FSNamesystem
6017   */
6018  void shutdown() {
6019    if (snapshotManager != null) {
6020      snapshotManager.shutdown();
6021    }
6022    if (mbeanName != null) {
6023      MBeans.unregister(mbeanName);
6024      mbeanName = null;
6025    }
6026    if (mxbeanName != null) {
6027      MBeans.unregister(mxbeanName);
6028      mxbeanName = null;
6029    }
6030    if (dir != null) {
6031      dir.shutdown();
6032    }
6033    if (blockManager != null) {
6034      blockManager.shutdown();
6035    }
6036  }
6037
  /** Number of currently live datanodes (per DatanodeManager). */
  @Override // FSNamesystemMBean
  @Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
6043
  /** Number of currently dead datanodes (per DatanodeManager). */
  @Override // FSNamesystemMBean
  @Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
6049  
6050  @Override // FSNamesystemMBean
6051  @Metric({"NumDecomLiveDataNodes",
6052      "Number of datanodes which have been decommissioned and are now live"})
6053  public int getNumDecomLiveDataNodes() {
6054    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6055    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6056    int liveDecommissioned = 0;
6057    for (DatanodeDescriptor node : live) {
6058      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6059    }
6060    return liveDecommissioned;
6061  }
6062
6063  @Override // FSNamesystemMBean
6064  @Metric({"NumDecomDeadDataNodes",
6065      "Number of datanodes which have been decommissioned and are now dead"})
6066  public int getNumDecomDeadDataNodes() {
6067    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6068    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
6069    int deadDecommissioned = 0;
6070    for (DatanodeDescriptor node : dead) {
6071      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6072    }
6073    return deadDecommissioned;
6074  }
6075
6076  @Override // FSNamesystemMBean
6077  @Metric({"VolumeFailuresTotal",
6078      "Total number of volume failures across all Datanodes"})
6079  public int getVolumeFailuresTotal() {
6080    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6081    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6082    int volumeFailuresTotal = 0;
6083    for (DatanodeDescriptor node: live) {
6084      volumeFailuresTotal += node.getVolumeFailures();
6085    }
6086    return volumeFailuresTotal;
6087  }
6088
6089  @Override // FSNamesystemMBean
6090  @Metric({"EstimatedCapacityLostTotal",
6091      "An estimate of the total capacity lost due to volume failures"})
6092  public long getEstimatedCapacityLostTotal() {
6093    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6094    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
6095    long estimatedCapacityLostTotal = 0;
6096    for (DatanodeDescriptor node: live) {
6097      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6098      if (volumeFailureSummary != null) {
6099        estimatedCapacityLostTotal +=
6100            volumeFailureSummary.getEstimatedCapacityLostTotal();
6101      }
6102    }
6103    return estimatedCapacityLostTotal;
6104  }
6105
  /** Number of datanodes currently decommissioning (per DatanodeManager). */
  @Override // FSNamesystemMBean
  @Metric({"NumDecommissioningDataNodes",
      "Number of datanodes in decommissioning state"})
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }
6113
  /** Number of datanodes marked stale for delayed heartbeats. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
6120
6121  /**
6122   * Storages are marked as "content stale" after NN restart or fails over and
6123   * before NN receives the first Heartbeat followed by the first Blockreport.
6124   */
6125  @Override // FSNamesystemMBean
6126  @Metric({"NumStaleStorages",
6127      "Number of storages marked as content stale"})
6128  public int getNumStaleStorages() {
6129    return getBlockManager().getDatanodeManager().getNumStaleStorages();
6130  }
6131
  /**
   * JSON snapshot of the top-user operation windows, or null when the
   * nntop feature is disabled or serialization fails.
   */
  @Override // FSNamesystemMBean
  public String getTopUserOpCounts() {
    if (!topConf.isEnabled) {
      return null;
    }

    Date now = new Date();
    final List<RollingWindowManager.TopWindow> topWindows =
        topMetrics.getTopWindows();
    Map<String, Object> topMap = new TreeMap<String, Object>();
    topMap.put("windows", topWindows);
    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
    // NOTE(review): a new ObjectMapper per call is relatively expensive;
    // a shared static final instance would work if thread-safe here.
    ObjectMapper mapper = new ObjectMapper();
    try {
      return mapper.writeValueAsString(topMap);
    } catch (IOException e) {
      // Best effort: log and fall through to null rather than fail the MBean.
      LOG.warn("Failed to fetch TopUser metrics", e);
    }
    return null;
  }
6152
6153  /**
6154   * Increments, logs and then returns the stamp
6155   */
6156  long nextGenerationStamp(boolean legacyBlock)
6157      throws IOException, SafeModeException {
6158    assert hasWriteLock();
6159    checkNameNodeSafeMode("Cannot get next generation stamp");
6160
6161    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
6162    if (legacyBlock) {
6163      getEditLog().logGenerationStampV1(gs);
6164    } else {
6165      getEditLog().logGenerationStampV2(gs);
6166    }
6167
6168    // NB: callers sync the log
6169    return gs;
6170  }
6171
6172  /**
6173   * Increments, logs and then returns the block ID
6174   */
6175  private long nextBlockId() throws IOException {
6176    assert hasWriteLock();
6177    checkNameNodeSafeMode("Cannot get next block ID");
6178    final long blockId = blockIdManager.nextBlockId();
6179    getEditLog().logAllocateBlockId(blockId);
6180    // NB: callers sync the log
6181    return blockId;
6182  }
6183
  /**
   * Determine whether a file has been effectively deleted: missing from the
   * inodeMap, detached from the directory tree by a recursive delete of an
   * ancestor, or surviving only as a snapshot copy.
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      // A missing parent means this subtree is detached from the tree.
      if (tmpParent == null) {
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        break;
      }

      // Walk up one level and re-verify the parent/child link.
      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // Reachable from the root, but the current file may still have been
    // deleted, leaving only its snapshot copies.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6221
  /**
   * Validate that a block is under construction and that the named client
   * holds the lease on its file. Must be called with the write lock held;
   * refused in safe mode.
   *
   * @param block the block being recovered or appended to
   * @param clientName expected lease holder
   * @return the INodeFile the block belongs to
   * @throws IOException if the block is missing or not under construction,
   *         or the file is gone / not under construction
   * @throws LeaseExpiredException if the lease is held by a different client
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6253  
6254  /**
6255   * Client is reporting some bad block locations.
6256   */
6257  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6258    checkOperation(OperationCategory.WRITE);
6259    writeLock();
6260    try {
6261      checkOperation(OperationCategory.WRITE);
6262      for (int i = 0; i < blocks.length; i++) {
6263        ExtendedBlock blk = blocks[i].getBlock();
6264        DatanodeInfo[] nodes = blocks[i].getLocations();
6265        String[] storageIDs = blocks[i].getStorageIDs();
6266        for (int j = 0; j < nodes.length; j++) {
6267          NameNode.stateChangeLog.info("*DIR* reportBadBlocks for block: {} on"
6268              + " datanode: {}", blk, nodes[j].getXferAddr());
6269          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6270              storageIDs == null ? null: storageIDs[j], 
6271              "client machine reported it");
6272        }
6273      }
6274    } finally {
6275      writeUnlock("reportBadBlocks");
6276    }
6277  }
6278
6279  /**
6280   * Get a new generation stamp together with an access token for 
6281   * a block under construction
6282   * 
6283   * This method is called for recovering a failed pipeline or setting up
6284   * a pipeline to append to a block.
6285   * 
6286   * @param block a block
6287   * @param clientName the name of a client
6288   * @return a located block with a new generation stamp and an access token
6289   * @throws IOException if any error occurs
6290   */
6291  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
6292      String clientName) throws IOException {
6293    LocatedBlock locatedBlock;
6294    checkOperation(OperationCategory.WRITE);
6295    writeLock();
6296    try {
6297      checkOperation(OperationCategory.WRITE);
6298
6299      // check vadility of parameters
6300      checkUCBlock(block, clientName);
6301  
6302      // get a new generation stamp and an access token
6303      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
6304      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
6305      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
6306    } finally {
6307      writeUnlock("bumpBlockGenerationStamp");
6308    }
6309    // Ensure we record the new generation stamp
6310    getEditLog().logSync();
6311    return locatedBlock;
6312  }
6313  
6314  /**
6315   * Update a pipeline for a block under construction
6316   * 
6317   * @param clientName the name of the client
6318   * @param oldBlock and old block
6319   * @param newBlock a new block with a new generation stamp and length
6320   * @param newNodes datanodes in the pipeline
6321   * @throws IOException if any error occurs
6322   */
6323  void updatePipeline(
6324      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
6325      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
6326      throws IOException {
6327    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
6328             + ", newGS=" + newBlock.getGenerationStamp()
6329             + ", newLength=" + newBlock.getNumBytes()
6330             + ", newNodes=" + Arrays.asList(newNodes)
6331             + ", client=" + clientName
6332             + ")");
6333    waitForLoadingFSImage();
6334    writeLock();
6335    try {
6336      checkOperation(OperationCategory.WRITE);
6337      checkNameNodeSafeMode("Pipeline not updated");
6338      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
6339        + oldBlock + " has different block identifier";
6340      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
6341          newStorageIDs, logRetryCache);
6342    } finally {
6343      writeUnlock("updatePipeline");
6344    }
6345    getEditLog().logSync();
6346    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
6347        + newBlock.getLocalBlock() + ") success");
6348  }
6349
  /**
   * Core of {@code updatePipeline}: validates the lease and generation
   * stamp, then records the new block length/GS and expected locations.
   * Caller must hold the namesystem write lock.
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final String src = pendingFile.getFullPathName();
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected — a pipeline update must
    // strictly advance the generation stamp and never shrink the block
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockManager.updateLastBlock(blockinfo, newBlock);

    // find the DatanodeDescriptor objects
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs,
            "src=%s, oldBlock=%s, newBlock=%s, clientName=%s",
            src, oldBlock, newBlock, clientName);
    blockinfo.setExpectedLocations(storages);

    // persist the block list change to the edit log (sync is done by caller)
    persistBlocks(src, pendingFile, logRetryCache);
  }
6383
  /**
   * Rename was successful. If any part of the renamed subtree had
   * files that were being written to, update the lease records with the
   * new filename. Caller must hold the namesystem write lock.
   */
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6390
6391  /**
6392   * Serializes leases.
6393   */
6394  void saveFilesUnderConstruction(DataOutputStream out,
6395      Map<Long, INodeFile> snapshotUCMap) throws IOException {
6396    // This is run by an inferior thread of saveNamespace, which holds a read
6397    // lock on our behalf. If we took the read lock here, we could block
6398    // for fairness if a writer is waiting on the lock.
6399    synchronized (leaseManager) {
6400      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
6401      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6402        // TODO: for HDFS-5428, because of rename operations, some
6403        // under-construction files that are
6404        // in the current fs directory can also be captured in the
6405        // snapshotUCMap. We should remove them from the snapshotUCMap.
6406        snapshotUCMap.remove(entry.getValue().getId());
6407      }
6408
6409      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
6410      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6411        FSImageSerialization.writeINodeUnderConstruction(
6412            out, entry.getValue(), entry.getKey());
6413      }
6414      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
6415        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
6416        // as their paths
6417        StringBuilder b = new StringBuilder();
6418        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
6419            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
6420            .append(Path.SEPARATOR).append(entry.getValue().getId());
6421        FSImageSerialization.writeINodeUnderConstruction(
6422            out, entry.getValue(), b.toString());
6423      }
6424    }
6425  }
6426
6427  /**
6428   * @return all the under-construction files in the lease map
6429   */
6430  Map<String, INodeFile> getFilesUnderConstruction() {
6431    synchronized (leaseManager) {
6432      return leaseManager.getINodesUnderConstruction();
6433    }
6434  }
6435
6436  /**
6437   * Register a Backup name-node, verifying that it belongs
6438   * to the correct namespace, and adding it to the set of
6439   * active journals if necessary.
6440   * 
6441   * @param bnReg registration of the new BackupNode
6442   * @param nnReg registration of this NameNode
6443   * @throws IOException if the namespace IDs do not match
6444   */
6445  void registerBackupNode(NamenodeRegistration bnReg,
6446      NamenodeRegistration nnReg) throws IOException {
6447    writeLock();
6448    try {
6449      if(getFSImage().getStorage().getNamespaceID() 
6450         != bnReg.getNamespaceID())
6451        throw new IOException("Incompatible namespaceIDs: "
6452            + " Namenode namespaceID = "
6453            + getFSImage().getStorage().getNamespaceID() + "; "
6454            + bnReg.getRole() +
6455            " node namespaceID = " + bnReg.getNamespaceID());
6456      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6457        getFSImage().getEditLog().registerBackupNode(
6458            bnReg, nnReg);
6459      }
6460    } finally {
6461      writeUnlock("registerBackupNode");
6462    }
6463  }
6464
6465  /**
6466   * Release (unregister) backup node.
6467   * <p>
6468   * Find and remove the backup stream corresponding to the node.
6469   * @throws IOException
6470   */
6471  void releaseBackupNode(NamenodeRegistration registration)
6472    throws IOException {
6473    checkOperation(OperationCategory.WRITE);
6474    writeLock();
6475    try {
6476      checkOperation(OperationCategory.WRITE);
6477      if(getFSImage().getStorage().getNamespaceID()
6478         != registration.getNamespaceID())
6479        throw new IOException("Incompatible namespaceIDs: "
6480            + " Namenode namespaceID = "
6481            + getFSImage().getStorage().getNamespaceID() + "; "
6482            + registration.getRole() +
6483            " node namespaceID = " + registration.getNamespaceID());
6484      getEditLog().releaseBackupStream(registration);
6485    } finally {
6486      writeUnlock("releaseBackupNode");
6487    }
6488  }
6489
6490  static class CorruptFileBlockInfo {
6491    final String path;
6492    final Block block;
6493    
6494    public CorruptFileBlockInfo(String p, Block b) {
6495      path = p;
6496      block = b;
6497    }
6498    
6499    @Override
6500    public String toString() {
6501      return block.getBlockName() + "\t" + path;
6502    }
6503  }
6504  /**
6505   * @param path Restrict corrupt files to this portion of namespace.
6506   * @param cookieTab Support for continuation; cookieTab  tells where
6507   *                  to start from
6508   * @return a list in which each entry describes a corrupt file/block
6509   * @throws IOException
6510   */
6511  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6512  String[] cookieTab) throws IOException {
6513    checkSuperuserPrivilege();
6514    checkOperation(OperationCategory.READ);
6515
6516    int count = 0;
6517    ArrayList<CorruptFileBlockInfo> corruptFiles =
6518        new ArrayList<CorruptFileBlockInfo>();
6519    if (cookieTab == null) {
6520      cookieTab = new String[] { null };
6521    }
6522
6523    // Do a quick check if there are any corrupt files without taking the lock
6524    if (blockManager.getMissingBlocksCount() == 0) {
6525      if (cookieTab[0] == null) {
6526        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
6527      }
6528      if (LOG.isDebugEnabled()) {
6529        LOG.debug("there are no corrupt file blocks.");
6530      }
6531      return corruptFiles;
6532    }
6533
6534    readLock();
6535    try {
6536      checkOperation(OperationCategory.READ);
6537      if (!isPopulatingReplQueues()) {
6538        throw new IOException("Cannot run listCorruptFileBlocks because " +
6539                              "replication queues have not been initialized.");
6540      }
6541      // print a limited # of corrupt files per call
6542
6543      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6544
6545      int skip = getIntCookie(cookieTab[0]);
6546      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6547        blkIterator.next();
6548      }
6549
6550      while (blkIterator.hasNext()) {
6551        Block blk = blkIterator.next();
6552        final INode inode = (INode)blockManager.getBlockCollection(blk);
6553        skip++;
6554        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6555          String src = inode.getFullPathName();
6556          if (src.startsWith(path)){
6557            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6558            count++;
6559            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6560              break;
6561          }
6562        }
6563      }
6564      cookieTab[0] = String.valueOf(skip);
6565      if (LOG.isDebugEnabled()) {
6566        LOG.debug("list corrupt file blocks returned: " + count);
6567      }
6568      return corruptFiles;
6569    } finally {
6570      readUnlock("listCorruptFileBlocks");
6571    }
6572  }
6573
6574  /**
6575   * Convert string cookie to integer.
6576   */
6577  private static int getIntCookie(String cookie){
6578    int c;
6579    if(cookie == null){
6580      c = 0;
6581    } else {
6582      try{
6583        c = Integer.parseInt(cookie);
6584      }catch (NumberFormatException e) {
6585        c = 0;
6586      }
6587    }
6588    c = Math.max(0, c);
6589    return c;
6590  }
6591
6592  /**
6593   * Create delegation token secret manager
6594   */
6595  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6596      Configuration conf) {
6597    return new DelegationTokenSecretManager(conf.getLong(
6598        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6599        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6600        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6601            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6602        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6603            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6604        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6605        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6606            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6607        this);
6608  }
6609
6610  /**
6611   * Returns the DelegationTokenSecretManager instance in the namesystem.
6612   * @return delegation token secret manager object
6613   */
6614  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6615    return dtSecretManager;
6616  }
6617
6618  /**
6619   * @param renewer Renewer information
6620   * @return delegation toek
6621   * @throws IOException on error
6622   */
6623  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6624      throws IOException {
6625    Token<DelegationTokenIdentifier> token;
6626    checkOperation(OperationCategory.WRITE);
6627    writeLock();
6628    try {
6629      checkOperation(OperationCategory.WRITE);
6630      checkNameNodeSafeMode("Cannot issue delegation token");
6631      if (!isAllowedDelegationTokenOp()) {
6632        throw new IOException(
6633          "Delegation Token can be issued only with kerberos or web authentication");
6634      }
6635      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6636        LOG.warn("trying to get DT with no secret manager running");
6637        return null;
6638      }
6639
6640      UserGroupInformation ugi = getRemoteUser();
6641      String user = ugi.getUserName();
6642      Text owner = new Text(user);
6643      Text realUser = null;
6644      if (ugi.getRealUser() != null) {
6645        realUser = new Text(ugi.getRealUser().getUserName());
6646      }
6647      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6648        renewer, realUser);
6649      token = new Token<DelegationTokenIdentifier>(
6650        dtId, dtSecretManager);
6651      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6652      getEditLog().logGetDelegationToken(dtId, expiryTime);
6653    } finally {
6654      writeUnlock("getDelegationToken");
6655    }
6656    getEditLog().logSync();
6657    return token;
6658  }
6659
6660  /**
6661   * 
6662   * @param token token to renew
6663   * @return new expiryTime of the token
6664   * @throws InvalidToken if {@code token} is invalid
6665   * @throws IOException on other errors
6666   */
6667  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6668      throws InvalidToken, IOException {
6669    long expiryTime;
6670    checkOperation(OperationCategory.WRITE);
6671    writeLock();
6672    try {
6673      checkOperation(OperationCategory.WRITE);
6674
6675      checkNameNodeSafeMode("Cannot renew delegation token");
6676      if (!isAllowedDelegationTokenOp()) {
6677        throw new IOException(
6678            "Delegation Token can be renewed only with kerberos or web authentication");
6679      }
6680      String renewer = getRemoteUser().getShortUserName();
6681      expiryTime = dtSecretManager.renewToken(token, renewer);
6682      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6683      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6684      DataInputStream in = new DataInputStream(buf);
6685      id.readFields(in);
6686      getEditLog().logRenewDelegationToken(id, expiryTime);
6687    } finally {
6688      writeUnlock("renewDelegationToken");
6689    }
6690    getEditLog().logSync();
6691    return expiryTime;
6692  }
6693
6694  /**
6695   * 
6696   * @param token token to cancel
6697   * @throws IOException on error
6698   */
6699  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6700      throws IOException {
6701    checkOperation(OperationCategory.WRITE);
6702    writeLock();
6703    try {
6704      checkOperation(OperationCategory.WRITE);
6705
6706      checkNameNodeSafeMode("Cannot cancel delegation token");
6707      String canceller = getRemoteUser().getUserName();
6708      DelegationTokenIdentifier id = dtSecretManager
6709        .cancelToken(token, canceller);
6710      getEditLog().logCancelDelegationToken(id);
6711    } finally {
6712      writeUnlock("cancelDelegationToken");
6713    }
6714    getEditLog().logSync();
6715  }
6716
6717  /**
6718   * @param out save state of the secret manager
6719   * @param sdPath String storage directory path
6720   */
6721  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
6722      throws IOException {
6723    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
6724  }
6725
  /** @return a snapshot of the delegation token secret manager state. */
  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }
6729
6730  /**
6731   * @param in load the state of secret manager from input stream
6732   */
6733  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6734    dtSecretManager.loadSecretManagerStateCompat(in);
6735  }
6736
  /**
   * Load secret manager state from a protobuf fsimage section.
   *
   * @param s the secret manager section header
   * @param keys persisted delegation keys
   * @param tokens persisted delegation tokens
   */
  void loadSecretManagerState(SecretManagerSection s,
      List<SecretManagerSection.DelegationKey> keys,
      List<SecretManagerSection.PersistToken> tokens) throws IOException {
    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
  }
6742
6743  /**
6744   * Log the updateMasterKey operation to edit logs
6745   * 
6746   * @param key new delegation key.
6747   */
6748  public void logUpdateMasterKey(DelegationKey key) {
6749    
6750    assert !isInSafeMode() :
6751      "this should never be called while in safemode, since we stop " +
6752      "the DT manager before entering safemode!";
6753    // No need to hold FSN lock since we don't access any internal
6754    // structures, and this is stopped before the FSN shuts itself
6755    // down, etc.
6756    getEditLog().logUpdateMasterKey(key);
6757    getEditLog().logSync();
6758  }
6759  
6760  /**
6761   * Log the cancellation of expired tokens to edit logs
6762   * 
6763   * @param id token identifier to cancel
6764   */
6765  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6766    assert !isInSafeMode() :
6767      "this should never be called while in safemode, since we stop " +
6768      "the DT manager before entering safemode!";
6769    // No need to hold FSN lock since we don't access any internal
6770    // structures, and this is stopped before the FSN shuts itself
6771    // down, etc.
6772    getEditLog().logCancelDelegationToken(id);
6773  }  
6774  
  /**
   * Record a lease reassignment in the edit log.
   * Caller must hold the namesystem write lock.
   */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6780  
6781  /**
6782   * 
6783   * @return true if delegation token operation is allowed
6784   */
6785  private boolean isAllowedDelegationTokenOp() throws IOException {
6786    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6787    if (UserGroupInformation.isSecurityEnabled()
6788        && (authMethod != AuthenticationMethod.KERBEROS)
6789        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6790        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6791      return false;
6792    }
6793    return true;
6794  }
6795  
6796  /**
6797   * Returns authentication method used to establish the connection
6798   * @return AuthenticationMethod used to establish connection
6799   * @throws IOException
6800   */
6801  private AuthenticationMethod getConnectionAuthenticationMethod()
6802      throws IOException {
6803    UserGroupInformation ugi = getRemoteUser();
6804    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6805    if (authMethod == AuthenticationMethod.PROXY) {
6806      authMethod = ugi.getRealUser().getAuthenticationMethod();
6807    }
6808    return authMethod;
6809  }
6810  
6811  /**
6812   * Client invoked methods are invoked over RPC and will be in 
6813   * RPC call context even if the client exits.
6814   */
6815  boolean isExternalInvocation() {
6816    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6817  }
6818
6819  private static InetAddress getRemoteIp() {
6820    InetAddress ip = Server.getRemoteIp();
6821    if (ip != null) {
6822      return ip;
6823    }
6824    return NamenodeWebHdfsMethods.getRemoteIp();
6825  }
6826  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6832  
6833  /**
6834   * Log fsck event in the audit log 
6835   */
6836  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6837    if (isAuditEnabled()) {
6838      logAuditEvent(true, getRemoteUser(),
6839                    remoteAddress,
6840                    "fsck", src, null, null);
6841    }
6842  }
6843  /**
6844   * Register NameNodeMXBean
6845   */
6846  private void registerMXBean() {
6847    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6848  }
6849
6850  /**
6851   * Class representing Namenode information for JMX interfaces
6852   */
6853  @Override // NameNodeMXBean
6854  public String getVersion() {
6855    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6856  }
6857
  /** @return used cluster capacity in bytes; delegates to {@link #getCapacityUsed()}. */
  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }
6862
  /** @return remaining cluster capacity in bytes; delegates to {@link #getCapacityRemaining()}. */
  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }
6867
  /** @return total cluster capacity in bytes; delegates to {@link #getCapacityTotal()}. */
  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }
6872
6873  @Override // NameNodeMXBean
6874  public String getSafemode() {
6875    if (!this.isInSafeMode())
6876      return "";
6877    return "Safe mode is ON. " + this.getSafeModeTip();
6878  }
6879
  /** @return whether the on-disk layout upgrade has been finalized. */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }
6884
  /** @return capacity used by non-DFS data across datanodes, in bytes. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
6889
  /** @return percentage of cluster capacity in use. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }
6894
  /** @return space used by this block pool, in bytes. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }
6899
  /** @return percentage of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }
6904
  /** @return percentage of cluster capacity still available. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }
6909
  /** @return total datanode cache capacity, in bytes. */
  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }
6914
  /** @return datanode cache capacity currently in use, in bytes. */
  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }
6919
  /** @return total number of blocks in the namespace. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }
6924
  /** @return total number of files and directories; also exported as a metric. */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }
6930
  /** @return number of blocks with no live replicas. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
6935  
  /** @return number of missing blocks whose replication factor is one. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
    return getMissingReplOneBlocksCount();
  }
6940
  /** @return current JVM thread count, from the thread MXBean. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6945
6946  /**
6947   * Returned information is a JSON representation of map with host name as the
6948   * key and value is a map of live node attribute keys to its values
6949   */
6950  @Override // NameNodeMXBean
6951  public String getLiveNodes() {
6952    final Map<String, Map<String,Object>> info = 
6953      new HashMap<String, Map<String,Object>>();
6954    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6955    blockManager.getDatanodeManager().fetchDatanodes(live, null, false);
6956    for (DatanodeDescriptor node : live) {
6957      ImmutableMap.Builder<String, Object> innerinfo =
6958          ImmutableMap.<String,Object>builder();
6959      innerinfo
6960          .put("infoAddr", node.getInfoAddr())
6961          .put("infoSecureAddr", node.getInfoSecureAddr())
6962          .put("xferaddr", node.getXferAddr())
6963          .put("lastContact", getLastContact(node))
6964          .put("usedSpace", getDfsUsed(node))
6965          .put("adminState", node.getAdminState().toString())
6966          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6967          .put("capacity", node.getCapacity())
6968          .put("numBlocks", node.numBlocks())
6969          .put("version", node.getSoftwareVersion())
6970          .put("used", node.getDfsUsed())
6971          .put("remaining", node.getRemaining())
6972          .put("blockScheduled", node.getBlocksScheduled())
6973          .put("blockPoolUsed", node.getBlockPoolUsed())
6974          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6975          .put("volfails", node.getVolumeFailures());
6976      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6977      if (volumeFailureSummary != null) {
6978        innerinfo
6979            .put("failedStorageLocations",
6980                volumeFailureSummary.getFailedStorageLocations())
6981            .put("lastVolumeFailureDate",
6982                volumeFailureSummary.getLastVolumeFailureDate())
6983            .put("estimatedCapacityLostTotal",
6984                volumeFailureSummary.getEstimatedCapacityLostTotal());
6985      }
6986      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
6987    }
6988    return JSON.toString(info);
6989  }
6990
6991  /**
6992   * Returned information is a JSON representation of map with host name as the
6993   * key and value is a map of dead node attribute keys to its values
6994   */
6995  @Override // NameNodeMXBean
6996  public String getDeadNodes() {
6997    final Map<String, Map<String, Object>> info = 
6998      new HashMap<String, Map<String, Object>>();
6999    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
7000    blockManager.getDatanodeManager().fetchDatanodes(null, dead, false);
7001    for (DatanodeDescriptor node : dead) {
7002      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
7003          .put("lastContact", getLastContact(node))
7004          .put("decommissioned", node.isDecommissioned())
7005          .put("xferaddr", node.getXferAddr())
7006          .build();
7007      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7008    }
7009    return JSON.toString(info);
7010  }
7011
7012  /**
7013   * Returned information is a JSON representation of map with host name as the
7014   * key and value is a map of decommissioning node attribute keys to its
7015   * values
7016   */
7017  @Override // NameNodeMXBean
7018  public String getDecomNodes() {
7019    final Map<String, Map<String, Object>> info = 
7020      new HashMap<String, Map<String, Object>>();
7021    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
7022        ).getDecommissioningNodes();
7023    for (DatanodeDescriptor node : decomNodeList) {
7024      Map<String, Object> innerinfo = ImmutableMap
7025          .<String, Object> builder()
7026          .put("xferaddr", node.getXferAddr())
7027          .put("underReplicatedBlocks",
7028              node.decommissioningStatus.getUnderReplicatedBlocks())
7029          .put("decommissionOnlyReplicas",
7030              node.decommissioningStatus.getDecommissionOnlyReplicas())
7031          .put("underReplicateInOpenFiles",
7032              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
7033          .build();
7034      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7035    }
7036    return JSON.toString(info);
7037  }
7038
  /** @return seconds elapsed since the datanode's last heartbeat. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }
7042
  /** @return DFS space used on the given datanode, in bytes. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
7046
  /** @return the cluster ID recorded in the fsimage storage. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }
7051  
  /** @return the block pool ID served by this namenode. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
7056  
7057  @Override  // NameNodeMXBean
7058  public String getNameDirStatuses() {
7059    Map<String, Map<File, StorageDirType>> statusMap =
7060      new HashMap<String, Map<File, StorageDirType>>();
7061    
7062    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7063    for (Iterator<StorageDirectory> it
7064        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7065      StorageDirectory st = it.next();
7066      activeDirs.put(st.getRoot(), st.getStorageDirType());
7067    }
7068    statusMap.put("active", activeDirs);
7069    
7070    List<Storage.StorageDirectory> removedStorageDirs
7071        = getFSImage().getStorage().getRemovedStorageDirs();
7072    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7073    for (StorageDirectory st : removedStorageDirs) {
7074      failedDirs.put(st.getRoot(), st.getStorageDirType());
7075    }
7076    statusMap.put("failed", failedDirs);
7077    
7078    return JSON.toString(statusMap);
7079  }
7080
7081  @Override // NameNodeMXBean
7082  public String getNodeUsage() {
7083    float median = 0;
7084    float max = 0;
7085    float min = 0;
7086    float dev = 0;
7087
7088    final Map<String, Map<String,Object>> info =
7089        new HashMap<String, Map<String,Object>>();
7090    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
7091    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
7092
7093    if (live.size() > 0) {
7094      float totalDfsUsed = 0;
7095      float[] usages = new float[live.size()];
7096      int i = 0;
7097      for (DatanodeDescriptor dn : live) {
7098        usages[i++] = dn.getDfsUsedPercent();
7099        totalDfsUsed += dn.getDfsUsedPercent();
7100      }
7101      totalDfsUsed /= live.size();
7102      Arrays.sort(usages);
7103      median = usages[usages.length / 2];
7104      max = usages[usages.length - 1];
7105      min = usages[0];
7106
7107      for (i = 0; i < usages.length; i++) {
7108        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
7109      }
7110      dev = (float) Math.sqrt(dev / usages.length);
7111    }
7112
7113    final Map<String, Object> innerInfo = new HashMap<String, Object>();
7114    innerInfo.put("min", StringUtils.format("%.2f%%", min));
7115    innerInfo.put("median", StringUtils.format("%.2f%%", median));
7116    innerInfo.put("max", StringUtils.format("%.2f%%", max));
7117    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
7118    info.put("nodeUsage", innerInfo);
7119
7120    return JSON.toString(info);
7121  }
7122
7123  @Override  // NameNodeMXBean
7124  public String getNameJournalStatus() {
7125    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
7126    FSEditLog log = getFSImage().getEditLog();
7127    if (log != null) {
7128      // This flag can be false because we cannot hold a lock of FSEditLog
7129      // for metrics.
7130      boolean openForWrite = log.isOpenForWriteWithoutLock();
7131      for (JournalAndStream jas : log.getJournals()) {
7132        final Map<String, String> jasMap = new HashMap<String, String>();
7133        String manager = jas.getManager().toString();
7134
7135        jasMap.put("required", String.valueOf(jas.isRequired()));
7136        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
7137        jasMap.put("manager", manager);
7138
7139        if (jas.isDisabled()) {
7140          jasMap.put("stream", "Failed");
7141        } else if (openForWrite) {
7142          EditLogOutputStream elos = jas.getCurrentStream();
7143          if (elos != null) {
7144            jasMap.put("stream", elos.generateReport());
7145          } else {
7146            jasMap.put("stream", "not currently writing");
7147          }
7148        } else {
7149          jasMap.put("stream", "open for read");
7150        }
7151        jasList.add(jasMap);
7152      }
7153    }
7154    return JSON.toString(jasList);
7155  }
7156
7157  @Override // NameNodeMxBean
7158  public String getJournalTransactionInfo() {
7159    Map<String, String> txnIdMap = new HashMap<String, String>();
7160    txnIdMap.put("LastAppliedOrWrittenTxId",
7161        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7162    txnIdMap.put("MostRecentCheckpointTxId",
7163        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7164    return JSON.toString(txnIdMap);
7165  }
7166  
  /** @return the NameNode startup time, as the default string form of the start Date. */
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    return getStartTime().toString();
  }
7171
7172  @Override  // NameNodeMXBean
7173  public String getCompileInfo() {
7174    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
7175        " from " + VersionInfo.getBranch();
7176  }
7177
  /** @return the block manager used by this namesystem. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
7182
  /** @return the block id manager used by this namesystem. */
  public BlockIdManager getBlockIdManager() {
    return blockIdManager;
  }
7186
  /** @return the FSDirectory used by this namesystem. */
  @Override
  public FSDirectory getFSDirectory() {
    return dir;
  }
  /** Set the FSDirectory. Test hook only. */
  @VisibleForTesting
  public void setFSDirectory(FSDirectory dir) {
    this.dir = dir;
  }
  /** @return the cache manager used by this namesystem. */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
7201
7202  @Override  // NameNodeMXBean
7203  public String getCorruptFiles() {
7204    List<String> list = new ArrayList<String>();
7205    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7206    try {
7207      corruptFileBlocks = listCorruptFileBlocks("/", null);
7208      int corruptFileCount = corruptFileBlocks.size();
7209      if (corruptFileCount != 0) {
7210        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7211          list.add(c.toString());
7212        }
7213      }
7214    } catch (IOException e) {
7215      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7216    }
7217    return JSON.toString(list);
7218  }
7219
7220  @Override  //NameNodeMXBean
7221  public int getDistinctVersionCount() {
7222    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
7223      .size();
7224  }
7225
  /** @return the DataNode software-versions map maintained by the DatanodeManager. */
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }
7230
  /** @return this NameNode's software version, from {@link VersionInfo}. */
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    return VersionInfo.getVersion();
  }
7235
7236  /**
7237   * Verifies that the given identifier and password are valid and match.
7238   * @param identifier Token identifier.
7239   * @param password Password in the token.
7240   */
7241  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7242      byte[] password) throws InvalidToken, RetriableException {
7243    try {
7244      getDelegationTokenSecretManager().verifyToken(identifier, password);
7245    } catch (InvalidToken it) {
7246      if (inTransitionToActive()) {
7247        throw new RetriableException(it);
7248      }
7249      throw it;
7250    }
7251  }
7252  
  /** Delegates the generation-stamp check for {@code block} to the block id manager. */
  @Override
  public boolean isGenStampInFuture(Block block) {
    return blockIdManager.isGenStampInFuture(block);
  }
7257
  /** @return the edit log tailer. Test hook only. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
7262  
  /** Replace the edit log tailer. Test hook only. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
7267  
  /** Replace the underlying coarse namesystem lock. Test hook only. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
7272  
  /** @return the underlying coarse namesystem lock. Test hook only. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
7277  
  /** @return the checkpoint lock. Test hook only. */
  @VisibleForTesting
  public ReentrantLock getCpLockForTests() {
    return cpLock;
  }
7282
  /** @return the current safe mode info object. Test hook only. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
7287  
  /** Replace the NameNode resource checker. Test hook only. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
7292
  /** @return the snapshot manager used by this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
7296  
  /**
   * Allow snapshots to be taken on the given directory (superuser only).
   *
   * @param path path of the directory to mark as snapshottable
   * @throws IOException if in safe mode, not a superuser, or the directory
   *         cannot be made snapshottable
   */
  void allowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final String operationName = "allowSnapshot";
    boolean success = false;
    writeLock();
    try {
      // Re-check under the write lock; state may have changed since the
      // unlocked check above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7315  
  /**
   * Disallow snapshots on the given directory (superuser only).
   *
   * @param path path of the directory to stop being snapshottable
   * @throws IOException if in safe mode, not a superuser, or the directory
   *         cannot be reset
   */
  void disallowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final String operationName = "disallowSnapshot";
    boolean success = false;
    writeLock();
    try {
      // Re-check under the write lock; state may have changed since the
      // unlocked check above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock(operationName);
    }
    // Sync the edit log and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(success, operationName, path, null, null);
  }
7334  
7335  /**
7336   * Create a snapshot
7337   * @param snapshotRoot The directory path where the snapshot is taken
7338   * @param snapshotName The name of the snapshot
7339   */
7340  String createSnapshot(String snapshotRoot, String snapshotName,
7341                        boolean logRetryCache) throws IOException {
7342    final String operationName = "createSnapshot";
7343    String snapshotPath = null;
7344    writeLock();
7345    try {
7346      checkOperation(OperationCategory.WRITE);
7347      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
7348      snapshotPath = FSDirSnapshotOp.createSnapshot(dir,
7349          snapshotManager, snapshotRoot, snapshotName, logRetryCache);
7350    } finally {
7351      writeUnlock(operationName);
7352    }
7353    getEditLog().logSync();
7354    logAuditEvent(snapshotPath != null, operationName, snapshotRoot,
7355        snapshotPath, null);
7356    return snapshotPath;
7357  }
7358  
7359  /**
7360   * Rename a snapshot
7361   * @param path The directory path where the snapshot was taken
7362   * @param snapshotOldName Old snapshot name
7363   * @param snapshotNewName New snapshot name
7364   * @throws SafeModeException
7365   * @throws IOException 
7366   */
7367  void renameSnapshot(
7368      String path, String snapshotOldName, String snapshotNewName,
7369      boolean logRetryCache) throws IOException {
7370    final String operationName = "renameSnapshot";
7371    boolean success = false;
7372    writeLock();
7373    try {
7374      checkOperation(OperationCategory.WRITE);
7375      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7376      FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
7377          snapshotOldName, snapshotNewName, logRetryCache);
7378      success = true;
7379    } finally {
7380      writeUnlock(operationName);
7381    }
7382    getEditLog().logSync();
7383    String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7384    String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7385    logAuditEvent(success, operationName, oldSnapshotRoot,
7386        newSnapshotRoot, null);
7387  }
7388  
7389  /**
7390   * Get the list of snapshottable directories that are owned 
7391   * by the current user. Return all the snapshottable directories if the 
7392   * current user is a super user.
7393   * @return The list of all the current snapshottable directories
7394   * @throws IOException
7395   */
7396  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7397      throws IOException {
7398    final String operationName = "listSnapshottableDirectory";
7399    SnapshottableDirectoryStatus[] status = null;
7400    checkOperation(OperationCategory.READ);
7401    boolean success = false;
7402    readLock();
7403    try {
7404      checkOperation(OperationCategory.READ);
7405      status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
7406      success = true;
7407    } finally {
7408      readUnlock(operationName);
7409    }
7410    logAuditEvent(success, operationName, null, null, null);
7411    return status;
7412  }
7413  
7414  /**
7415   * Get the difference between two snapshots (or between a snapshot and the
7416   * current status) of a snapshottable directory.
7417   * 
7418   * @param path The full path of the snapshottable directory.
7419   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7420   *          or empty string indicates the current tree.
7421   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7422   *          empty string indicates the current tree.
7423   * @return A report about the difference between {@code fromSnapshot} and 
7424   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7425   *         directories belonging to the snapshottable directories are listed 
7426   *         and labeled as M/-/+/R respectively. 
7427   * @throws IOException
7428   */
7429  SnapshotDiffReport getSnapshotDiffReport(String path,
7430      String fromSnapshot, String toSnapshot) throws IOException {
7431    final String operationName = "computeSnapshotDiff";
7432    SnapshotDiffReport diffs = null;
7433    checkOperation(OperationCategory.READ);
7434    readLock();
7435    try {
7436      checkOperation(OperationCategory.READ);
7437      diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
7438          path, fromSnapshot, toSnapshot);
7439    } finally {
7440      readUnlock(operationName);
7441    }
7442
7443    logAuditEvent(diffs != null, operationName, null, null, null);
7444    return diffs;
7445  }
7446  
7447  /**
7448   * Delete a snapshot of a snapshottable directory
7449   * @param snapshotRoot The snapshottable directory
7450   * @param snapshotName The name of the to-be-deleted snapshot
7451   * @throws SafeModeException
7452   * @throws IOException
7453   */
7454  void deleteSnapshot(String snapshotRoot, String snapshotName,
7455      boolean logRetryCache) throws IOException {
7456    final String operationName = "deleteSnapshot";
7457    boolean success = false;
7458    writeLock();
7459    BlocksMapUpdateInfo blocksToBeDeleted = null;
7460    try {
7461      checkOperation(OperationCategory.WRITE);
7462      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7463
7464      blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
7465          snapshotRoot, snapshotName, logRetryCache);
7466      success = true;
7467    } finally {
7468      writeUnlock(operationName);
7469    }
7470    getEditLog().logSync();
7471
7472    // Breaking the pattern as removing blocks have to happen outside of the
7473    // global lock
7474    if (blocksToBeDeleted != null) {
7475      removeBlocks(blocksToBeDeleted);
7476    }
7477
7478    String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7479    logAuditEvent(success, operationName, rootPath, null, null);
7480  }
7481
7482  /**
7483   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7484   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7485   */
7486  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
7487    if (snapshotManager != null) {
7488      snapshotManager.removeSnapshottable(toRemove);
7489    }
7490  }
7491
  /**
   * Query the status of the current rolling upgrade (superuser only).
   *
   * @return the current rolling upgrade info with its rollback-image flag
   *         refreshed, or null if no rolling upgrade is in progress
   * @throws IOException if the caller is not a superuser or the rollback
   *         image check fails
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (!isRollingUpgrade()) {
        return null;
      }
      Preconditions.checkNotNull(rollingUpgradeInfo);
      // Refresh the flag from disk; a rollback image may have been created
      // since the upgrade was started.
      boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
      rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      return rollingUpgradeInfo;
    } finally {
      readUnlock("queryRollingUpgrade");
    }
  }
7508
  /**
   * Start a rolling upgrade (superuser only). Idempotent: if an upgrade is
   * already in progress, the existing info is returned unchanged.
   *
   * @return the rolling upgrade info describing the (possibly pre-existing)
   *         upgrade
   * @throws IOException if the caller is not a superuser or the upgrade
   *         cannot be started
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    final String operationName = "startRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock(operationName);
    }

    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7542
7543  /**
7544   * Update internal state to indicate that a rolling upgrade is in progress.
7545   * @param startTime rolling upgrade start time
7546   */
7547  void startRollingUpgradeInternal(long startTime)
7548      throws IOException {
7549    checkRollingUpgrade("start rolling upgrade");
7550    getFSImage().checkUpgrade();
7551    setRollingUpgradeInfo(false, startTime);
7552  }
7553
7554  /**
7555   * Update internal state to indicate that a rolling upgrade is in progress for
7556   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7557   * checkpoint for rollback the namesystem will quit the safemode automatically 
7558   */
7559  private void startRollingUpgradeInternalForNonHA(long startTime)
7560      throws IOException {
7561    Preconditions.checkState(!haEnabled);
7562    if (!isInSafeMode()) {
7563      throw new IOException("Safe mode should be turned ON "
7564          + "in order to create namespace image.");
7565    }
7566    checkRollingUpgrade("start rolling upgrade");
7567    getFSImage().checkUpgrade();
7568    // in non-HA setup, we do an extra checkpoint to generate a rollback image
7569    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7570    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7571
7572    // leave SafeMode automatically
7573    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7574    setRollingUpgradeInfo(true, startTime);
7575  }
7576
  /**
   * Install a fresh RollingUpgradeInfo for this block pool with the given
   * rollback-image flag and start time (finalize time starts at 0).
   */
  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
        createdRollbackImages, startTime, 0L);
  }
7581
7582  public void setCreatedRollbackImages(boolean created) {
7583    if (rollingUpgradeInfo != null) {
7584      rollingUpgradeInfo.setCreatedRollbackImages(created);
7585    }
7586  }
7587
  /** @return the current rolling upgrade info, or null if none was set. */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7591
  /** @return whether a rollback fsimage still needs to be created. */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7595
  /** Set whether a rollback fsimage still needs to be created. */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7599
  /**
   * @return a bean view of the current rolling upgrade status, or null when
   *         no rolling upgrade is in progress. The rollback-image flag is
   *         refreshed under the read lock if it is not yet set.
   */
  @Override  // NameNodeMXBean
  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
    if (!isRollingUpgrade()) {
      return null;
    }
    // Fast path: if the flag is already set we can answer without locking.
    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
    if (upgradeInfo.createdRollbackImages()) {
      return new RollingUpgradeInfo.Bean(upgradeInfo);
    }
    readLock();
    try {
      // check again after acquiring the read lock.
      upgradeInfo = getRollingUpgradeInfo();
      if (upgradeInfo == null) {
        return null;
      }
      if (!upgradeInfo.createdRollbackImages()) {
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
    } catch (IOException ioe) {
      // Best-effort refresh; fall through and report the current state.
      LOG.warn("Encountered exception setting Rollback Image", ioe);
    } finally {
      readUnlock("getRollingUpgradeStatus");
    }
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
7627
7628  /** Is rolling upgrade in progress? */
7629  public boolean isRollingUpgrade() {
7630    return rollingUpgradeInfo != null && !rollingUpgradeInfo.isFinalized();
7631  }
7632
7633  void checkRollingUpgrade(String action) throws RollingUpgradeException {
7634    if (isRollingUpgrade()) {
7635      throw new RollingUpgradeException("Failed to " + action
7636          + " since a rolling upgrade is already in progress."
7637          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
7638    }
7639  }
7640
  /**
   * Finalize the current rolling upgrade (superuser only): log the finalize,
   * bump the storage version and promote the rollback image to the regular
   * image name.
   *
   * @return the finalized rolling upgrade info, or null if no rolling
   *         upgrade was in progress
   * @throws IOException if the caller is not a superuser, the NameNode is in
   *         safe mode, or the image/storage updates fail
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    final String operationName = "finalizeRollingUpgrade";
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (!isRollingUpgrade()) {
        return null;
      }
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(rollingUpgradeInfo.getFinalizeTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
      getFSImage().updateStorageVersion();
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock(operationName);
    }

    if (!haEnabled) {
      // Sync not needed for ha since the edit was rolled after logging.
      getEditLog().logSync();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, operationName, null, null, null);
    }
    return rollingUpgradeInfo;
  }
7676
  /** Mark the in-progress rolling upgrade as finalized at the given time. */
  void finalizeRollingUpgradeInternal(long finalizeTime) {
    // Set the finalize time
    rollingUpgradeInfo.finalize(finalizeTime);
  }
7681
  /**
   * Add a new cache directive.
   *
   * @param directive the directive to add
   * @param flags cache flags; without FORCE this waits for a cache rescan
   *        before taking the write lock
   * @param logRetryCache whether to record this call in the retry cache
   * @return the id of the new directive, or 0 on failure
   * @throws IOException if in safe mode or the directive is invalid
   */
  long addCacheDirective(CacheDirectiveInfo directive,
                         EnumSet<CacheFlag> flags, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCacheDirective";
    CacheDirectiveInfo effectiveDirective = null;
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
          directive, flags, logRetryCache);
    } finally {
      writeUnlock(operationName);
      // A non-null effective directive doubles as the success flag; only a
      // successful add produced an edit that needs syncing.
      boolean success = effectiveDirective != null;
      if (success) {
        getEditLog().logSync();
      }

      String effectiveDirectiveStr = effectiveDirective != null ?
          effectiveDirective.toString() : null;
      logAuditEvent(success, operationName, effectiveDirectiveStr,
          null, null);
    }
    return effectiveDirective != null ? effectiveDirective.getId() : 0;
  }
7713
7714  void modifyCacheDirective(CacheDirectiveInfo directive,
7715      EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException {
7716    final String operationName = "modifyCacheDirective";
7717    boolean success = false;
7718    if (!flags.contains(CacheFlag.FORCE)) {
7719      cacheManager.waitForRescanIfNeeded();
7720    }
7721    writeLock();
7722    try {
7723      checkOperation(OperationCategory.WRITE);
7724      if (isInSafeMode()) {
7725        throw new SafeModeException(
7726            "Cannot add cache directive", safeMode);
7727      }
7728      FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags,
7729          logRetryCache);
7730      success = true;
7731    } finally {
7732      writeUnlock(operationName);
7733      if (success) {
7734        getEditLog().logSync();
7735      }
7736      String idStr = "{id: " + directive.getId().toString() + "}";
7737      logAuditEvent(success, "modifyCacheDirective", idStr,
7738          directive.toString(), null);
7739    }
7740  }
7741
  /**
   * Remove a cache directive by id.
   *
   * @param id id of the directive to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the id is unknown
   */
  void removeCacheDirective(long id, boolean logRetryCache) throws IOException {
    final String operationName = "removeCacheDirective";
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      String idStr = "{id: " + Long.toString(id) + "}";
      logAuditEvent(success, operationName, idStr, null,
          null);
    }
    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
  }
7762
7763  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
7764      long startId, CacheDirectiveInfo filter) throws IOException {
7765    final String operationName = "listCacheDirectives";
7766    checkOperation(OperationCategory.READ);
7767    BatchedListEntries<CacheDirectiveEntry> results;
7768    cacheManager.waitForRescanIfNeeded();
7769    readLock();
7770    boolean success = false;
7771    try {
7772      checkOperation(OperationCategory.READ);
7773      results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId,
7774          filter);
7775      success = true;
7776    } finally {
7777      readUnlock(operationName);
7778      logAuditEvent(success, operationName, filter.toString(), null,
7779          null);
7780    }
7781    return results;
7782  }
7783
  /**
   * Add a new cache pool.
   *
   * @param req description of the pool to create
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the pool cannot be created
   */
  void addCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "addCachePool";
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req,
          logRetryCache);
      poolInfoStr = info.toString();
      success = true;
    } finally {
      writeUnlock(operationName);
      logAuditEvent(success, operationName, poolInfoStr, null, null);
    }
    
    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
  }
7807
  /**
   * Modify an existing cache pool.
   *
   * @param req the pool changes; must carry the pool name
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the modification is invalid
   */
  void modifyCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    final String operationName = "modifyCachePool";
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      // Null-guarded for the audit trail even on early failures.
      String poolNameStr = "{poolName: " +
          (req == null ? null : req.getPoolName()) + "}";
      logAuditEvent(success, operationName, poolNameStr,
                    req == null ? null : req.toString(), null);
    }

    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
  }
7831
  /**
   * Remove a cache pool by name.
   *
   * @param cachePoolName name of the pool to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the pool cannot be removed
   */
  void removeCachePool(String cachePoolName, boolean logRetryCache)
      throws IOException {
    final String operationName = "removeCachePool";
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
          logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      String poolNameStr = "{poolName: " + cachePoolName + "}";
      logAuditEvent(success, operationName, poolNameStr, null, null);
    }
    
    // Sync the edit log outside of the write lock.
    getEditLog().logSync();
  }
7854
  /**
   * List cache pools, resuming after the given key.
   *
   * @param prevKey pool name to resume listing after
   * @return a batch of cache pool entries
   * @throws IOException on permission or lookup failure
   */
  BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    final String operationName = "listCachePools";
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
      success = true;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, null, null, null);
    }
    return results;
  }
7873
  /**
   * Modify ACL entries on the given path.
   *
   * @param src path whose ACL is modified
   * @param aclSpec entries to add or update
   * @throws IOException if in safe mode, access is denied, or the spec is invalid
   */
  void modifyAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "modifyAclEntries";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7893
  /**
   * Remove the given ACL entries from the given path.
   *
   * @param src path whose ACL entries are removed
   * @param aclSpec entries to remove
   * @throws IOException if in safe mode, access is denied, or the spec is invalid
   */
  void removeAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    final String operationName = "removeAclEntries";
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7913
  /**
   * Remove the default ACL entries from the given path.
   *
   * @param src path whose default ACL entries are removed
   * @throws IOException if in safe mode or access is denied
   */
  void removeDefaultAcl(final String src) throws IOException {
    final String operationName = "removeDefaultAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
      auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7932
  /**
   * Remove the entire ACL from the given path.
   *
   * @param src path whose ACL is removed
   * @throws IOException if in safe mode or access is denied
   */
  void removeAcl(final String src) throws IOException {
    final String operationName = "removeAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      auditStat = FSDirAclOp.removeAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7951
  /**
   * Fully replace the ACL on the given path.
   *
   * @param src path whose ACL is set
   * @param aclSpec the complete new ACL
   * @throws IOException if in safe mode, access is denied, or the spec is invalid
   */
  void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
    final String operationName = "setAcl";
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set ACL on " + src);
      auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before rethrowing.
      logAuditEvent(false, operationName, src);
      throw e;
    } finally {
      writeUnlock(operationName);
    }
    // Sync and audit outside of the write lock.
    getEditLog().logSync();
    logAuditEvent(true, operationName, src, null, auditStat);
  }
7970
  /**
   * Get the ACL status of the given path.
   *
   * @param src path to query
   * @return the ACL status of the path
   * @throws IOException if access is denied or the path cannot be resolved
   */
  AclStatus getAclStatus(String src) throws IOException {
    final String operationName = "getAclStatus";
    checkOperation(OperationCategory.READ);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
      success = true;
      return ret;
    } finally {
      readUnlock(operationName);
      logAuditEvent(success, operationName, src);
    }
  }
7986
7987  /**
7988   * Create an encryption zone on directory src using the specified key.
7989   *
7990   * @param src     the path of a directory which will be the root of the
7991   *                encryption zone. The directory must be empty.
7992   * @param keyName name of a key which must be present in the configured
7993   *                KeyProvider.
7994   * @throws AccessControlException  if the caller is not the superuser.
7995   * @throws UnresolvedLinkException if the path can't be resolved.
7996   * @throws SafeModeException       if the Namenode is in safe mode.
7997   */
7998  void createEncryptionZone(final String src, final String keyName,
7999                            boolean logRetryCache)
8000    throws IOException, UnresolvedLinkException,
8001      SafeModeException, AccessControlException {
8002    try {
8003      if (provider == null) {
8004        throw new IOException(
8005            "Can't create an encryption zone for " + src +
8006            " since no key provider is available.");
8007      }
8008      if (keyName == null || keyName.isEmpty()) {
8009        throw new IOException("Must specify a key name when creating an " +
8010            "encryption zone");
8011      }
8012      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
8013      if (metadata == null) {
8014        /*
8015         * It would be nice if we threw something more specific than
8016         * IOException when the key is not found, but the KeyProvider API
8017         * doesn't provide for that. If that API is ever changed to throw
8018         * something more specific (e.g. UnknownKeyException) then we can
8019         * update this to match it, or better yet, just rethrow the
8020         * KeyProvider's exception.
8021         */
8022        throw new IOException("Key " + keyName + " doesn't exist.");
8023      }
8024      // If the provider supports pool for EDEKs, this will fill in the pool
8025      generateEncryptedDataEncryptionKey(keyName);
8026      createEncryptionZoneInt(src, metadata.getCipher(),
8027          keyName, logRetryCache);
8028    } catch (AccessControlException e) {
8029      logAuditEvent(false, "createEncryptionZone", src);
8030      throw e;
8031    }
8032  }
8033
  /**
   * Locked half of {@link #createEncryptionZone}: resolves the path, stamps
   * the encryption-zone xattr on it, and records the change in the edit log.
   * Caller has already validated the key against the provider.
   *
   * @param srcArg        path of the (empty) directory to become the EZ root
   * @param cipher        cipher name from the key's provider metadata
   * @param keyName       name of the key protecting the zone
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if the NN is in safe mode, the caller is not the
   *                     superuser, or the directory cannot become an EZ
   */
  private void createEncryptionZoneInt(final String srcArg, String cipher,
      String keyName, final boolean logRetryCache) throws IOException {
    final String operationName = "createEncryptionZone";
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    checkSuperuserPrivilege();
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      // Superuser/HA-state checks are repeated under the lock because state
      // may have changed between the pre-lock checks and lock acquisition.
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
      final INodesInPath iip = dir.resolvePathForWrite(pc, src);
      src = iip.getPath();

      final CipherSuite suite = CipherSuite.convert(cipher);
      // For now this is hardcoded, as we only support one method.
      final CryptoProtocolVersion version =
          CryptoProtocolVersion.ENCRYPTION_ZONES;
      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
          version, keyName);
      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
      xAttrs.add(ezXAttr);
      // The EZ marker is persisted as a setXAttrs edit-log op.
      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
      resultingStat = dir.getAuditFileInfo(iip);
    } finally {
      writeUnlock(operationName);
    }
    // logSync() deliberately happens outside the write lock; audit with the
    // caller-supplied path (srcArg), not the resolved one.
    getEditLog().logSync();
    logAuditEvent(true, operationName, srcArg, null, resultingStat);
  }
8065
8066  /**
8067   * Get the encryption zone for the specified path.
8068   *
8069   * @param srcArg the path of a file or directory to get the EZ for.
8070   * @return the EZ of the of the path or null if none.
8071   * @throws AccessControlException  if the caller is not the superuser.
8072   * @throws UnresolvedLinkException if the path can't be resolved.
8073   */
8074  EncryptionZone getEZForPath(final String srcArg)
8075    throws AccessControlException, UnresolvedLinkException, IOException {
8076    String src = srcArg;
8077    final String operationName = "getEZForPath";
8078    HdfsFileStatus resultingStat = null;
8079    boolean success = false;
8080    final FSPermissionChecker pc = getPermissionChecker();
8081    checkOperation(OperationCategory.READ);
8082    readLock();
8083    try {
8084      checkOperation(OperationCategory.READ);
8085      INodesInPath iip = dir.resolvePath(pc, src);
8086      if (isPermissionEnabled) {
8087        dir.checkPathAccess(pc, iip, FsAction.READ);
8088      }
8089      final EncryptionZone ret = dir.getEZForPath(iip);
8090      resultingStat = dir.getAuditFileInfo(iip);
8091      success = true;
8092      return ret;
8093    } finally {
8094      readUnlock(operationName);
8095      logAuditEvent(success, operationName, srcArg, null, resultingStat);
8096    }
8097  }
8098
8099  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
8100      throws IOException {
8101    final String operationName = "listEncryptionZones";
8102    boolean success = false;
8103    checkSuperuserPrivilege();
8104    checkOperation(OperationCategory.READ);
8105    readLock();
8106    try {
8107      checkSuperuserPrivilege();
8108      checkOperation(OperationCategory.READ);
8109      final BatchedListEntries<EncryptionZone> ret =
8110          dir.listEncryptionZones(prevId);
8111      success = true;
8112      return ret;
8113    } finally {
8114      readUnlock(operationName);
8115      logAuditEvent(success, operationName, null);
8116    }
8117  }
8118
8119  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
8120                boolean logRetryCache)
8121      throws IOException {
8122    final String operationName = "setXAttr";
8123    HdfsFileStatus auditStat = null;
8124    writeLock();
8125    try {
8126      checkOperation(OperationCategory.WRITE);
8127      checkNameNodeSafeMode("Cannot set XAttr on " + src);
8128      auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
8129    } catch (AccessControlException e) {
8130      logAuditEvent(false, operationName, src);
8131      throw e;
8132    } finally {
8133      writeUnlock(operationName);
8134    }
8135    getEditLog().logSync();
8136    logAuditEvent(true, operationName, src, null, auditStat);
8137  }
8138
8139  List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
8140      throws IOException {
8141    final String operationName = "getXAttrs";
8142    checkOperation(OperationCategory.READ);
8143    readLock();
8144    try {
8145      checkOperation(OperationCategory.READ);
8146      return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
8147    } catch (AccessControlException e) {
8148      logAuditEvent(false, operationName, src);
8149      throw e;
8150    } finally {
8151      readUnlock(operationName);
8152    }
8153  }
8154
8155  List<XAttr> listXAttrs(String src) throws IOException {
8156    final String operationName = "listXAttrs";
8157    checkOperation(OperationCategory.READ);
8158    readLock();
8159    try {
8160      checkOperation(OperationCategory.READ);
8161      return FSDirXAttrOp.listXAttrs(dir, src);
8162    } catch (AccessControlException e) {
8163      logAuditEvent(false, operationName, src);
8164      throw e;
8165    } finally {
8166      readUnlock(operationName);
8167    }
8168  }
8169
8170  void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
8171      throws IOException {
8172    final String operationName = "removeXAttr";
8173    HdfsFileStatus auditStat = null;
8174    writeLock();
8175    try {
8176      checkOperation(OperationCategory.WRITE);
8177      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
8178      auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
8179    } catch (AccessControlException e) {
8180      logAuditEvent(false, operationName, src);
8181      throw e;
8182    } finally {
8183      writeUnlock(operationName);
8184    }
8185    getEditLog().logSync();
8186    logAuditEvent(true, operationName, src, null, auditStat);
8187  }
8188
8189  void checkAccess(String src, FsAction mode) throws IOException {
8190    final String operationName = "checkAccess";
8191    checkOperation(OperationCategory.READ);
8192    FSPermissionChecker pc = getPermissionChecker();
8193    readLock();
8194    try {
8195      checkOperation(OperationCategory.READ);
8196      final INodesInPath iip = dir.resolvePath(pc, src);
8197      src = iip.getPath();
8198      INode inode = iip.getLastINode();
8199      if (inode == null) {
8200        throw new FileNotFoundException("Path not found");
8201      }
8202      if (isPermissionEnabled) {
8203        dir.checkPathAccess(pc, iip, mode);
8204      }
8205    } catch (AccessControlException e) {
8206      logAuditEvent(false, operationName, src);
8207      throw e;
8208    } finally {
8209      readUnlock(operationName);
8210    }
8211  }
8212
8213  /**
8214   * Default AuditLogger implementation; used when no access logger is
8215   * defined in the config file. It can also be explicitly listed in the
8216   * config file.
8217   */
8218  private static class DefaultAuditLogger extends HdfsAuditLogger {
8219
8220    private boolean logTokenTrackingId;
8221
8222    @Override
8223    public void initialize(Configuration conf) {
8224      logTokenTrackingId = conf.getBoolean(
8225          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
8226          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
8227    }
8228
8229    @Override
8230    public void logAuditEvent(boolean succeeded, String userName,
8231        InetAddress addr, String cmd, String src, String dst,
8232        FileStatus status, UserGroupInformation ugi,
8233        DelegationTokenSecretManager dtSecretManager) {
8234      if (auditLog.isInfoEnabled()) {
8235        final StringBuilder sb = auditBuffer.get();
8236        sb.setLength(0);
8237        sb.append("allowed=").append(succeeded).append("\t");
8238        sb.append("ugi=").append(userName).append("\t");
8239        sb.append("ip=").append(addr).append("\t");
8240        sb.append("cmd=").append(cmd).append("\t");
8241        sb.append("src=").append(src).append("\t");
8242        sb.append("dst=").append(dst).append("\t");
8243        if (null == status) {
8244          sb.append("perm=null");
8245        } else {
8246          sb.append("perm=");
8247          sb.append(status.getOwner()).append(":");
8248          sb.append(status.getGroup()).append(":");
8249          sb.append(status.getPermission());
8250        }
8251        if (logTokenTrackingId) {
8252          sb.append("\t").append("trackingId=");
8253          String trackingId = null;
8254          if (ugi != null && dtSecretManager != null
8255              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
8256            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
8257              if (tid instanceof DelegationTokenIdentifier) {
8258                DelegationTokenIdentifier dtid =
8259                    (DelegationTokenIdentifier)tid;
8260                trackingId = dtSecretManager.getTokenTrackingId(dtid);
8261                break;
8262              }
8263            }
8264          }
8265          sb.append(trackingId);
8266        }
8267        sb.append("\t").append("proto=");
8268        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
8269        logAuditMessage(sb.toString());
8270      }
8271    }
8272
8273    public void logAuditMessage(String message) {
8274      auditLog.info(message);
8275    }
8276  }
8277
8278  private static void enableAsyncAuditLog() {
8279    if (!(auditLog instanceof Log4JLogger)) {
8280      LOG.warn("Log4j is required to enable async auditlog");
8281      return;
8282    }
8283    Logger logger = ((Log4JLogger)auditLog).getLogger();
8284    @SuppressWarnings("unchecked")
8285    List<Appender> appenders = Collections.list(logger.getAllAppenders());
8286    // failsafe against trying to async it more than once
8287    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
8288      AsyncAppender asyncAppender = new AsyncAppender();
8289      // change logger to have an async appender containing all the
8290      // previously configured appenders
8291      for (Appender appender : appenders) {
8292        logger.removeAppender(appender);
8293        asyncAppender.addAppender(appender);
8294      }
8295      logger.addAppender(asyncAppender);        
8296    }
8297  }
8298
8299}
8300