001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion; 021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT; 022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY; 023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; 024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; 025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT; 026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY; 027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT; 028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY; 029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT; 030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY; 031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT; 032import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY; 033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT; 034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY; 035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT; 036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY; 037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT; 038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY; 039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY; 040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT; 041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY; 042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT; 043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY; 044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT; 045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY; 046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME; 047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT; 048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY; 049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT; 050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY; 051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT; 052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY; 053import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT; 054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY; 055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY; 056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY; 057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS; 058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT; 059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD; 060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT; 061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT; 062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY; 063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC; 064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT; 065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY; 066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT; 067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY; 068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY; 069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT; 070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY; 071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY; 072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT; 073import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY; 074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT; 075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY; 076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT; 077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY; 078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY; 079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT; 080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY; 081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT; 082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY; 083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY; 084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT; 085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY; 086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT; 087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY; 088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT; 089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY; 090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT; 091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY; 092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER; 093import static org.apache.hadoop.util.Time.now; 094import static org.apache.hadoop.util.Time.monotonicNow; 095 096import java.io.BufferedWriter; 097import 
java.io.ByteArrayInputStream; 098import java.io.DataInput; 099import java.io.DataInputStream; 100import java.io.DataOutputStream; 101import java.io.File; 102import java.io.FileNotFoundException; 103import java.io.FileOutputStream; 104import java.io.IOException; 105import java.io.OutputStreamWriter; 106import java.io.PrintWriter; 107import java.io.StringWriter; 108import java.lang.management.ManagementFactory; 109import java.net.InetAddress; 110import java.net.URI; 111import java.security.GeneralSecurityException; 112import java.util.ArrayList; 113import java.util.Arrays; 114import java.util.Collection; 115import java.util.Collections; 116import java.util.Date; 117import java.util.EnumSet; 118import java.util.HashMap; 119import java.util.HashSet; 120import java.util.Iterator; 121import java.util.LinkedHashSet; 122import java.util.List; 123import java.util.Map; 124import java.util.Set; 125import java.util.TreeMap; 126import java.util.concurrent.TimeUnit; 127import java.util.concurrent.locks.Condition; 128import java.util.concurrent.locks.ReentrantLock; 129import java.util.concurrent.locks.ReentrantReadWriteLock; 130 131import javax.management.NotCompliantMBeanException; 132import javax.management.ObjectName; 133import javax.management.StandardMBean; 134 135import org.apache.commons.logging.Log; 136import org.apache.commons.logging.LogFactory; 137import org.apache.commons.logging.impl.Log4JLogger; 138import org.apache.hadoop.HadoopIllegalArgumentException; 139import org.apache.hadoop.classification.InterfaceAudience; 140import org.apache.hadoop.conf.Configuration; 141import org.apache.hadoop.crypto.CipherSuite; 142import org.apache.hadoop.crypto.CryptoProtocolVersion; 143import org.apache.hadoop.crypto.key.KeyProvider; 144import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension; 145import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries; 146import org.apache.hadoop.fs.CacheFlag; 147import org.apache.hadoop.fs.ContentSummary; 148import 
org.apache.hadoop.fs.CreateFlag; 149import org.apache.hadoop.fs.FileAlreadyExistsException; 150import org.apache.hadoop.fs.FileEncryptionInfo; 151import org.apache.hadoop.fs.FileStatus; 152import org.apache.hadoop.fs.FileSystem; 153import org.apache.hadoop.fs.FsServerDefaults; 154import org.apache.hadoop.fs.InvalidPathException; 155import org.apache.hadoop.fs.Options; 156import org.apache.hadoop.fs.ParentNotDirectoryException; 157import org.apache.hadoop.fs.Path; 158import org.apache.hadoop.fs.UnresolvedLinkException; 159import org.apache.hadoop.fs.XAttr; 160import org.apache.hadoop.fs.XAttrSetFlag; 161import org.apache.hadoop.fs.permission.AclEntry; 162import org.apache.hadoop.fs.permission.AclStatus; 163import org.apache.hadoop.fs.permission.FsAction; 164import org.apache.hadoop.fs.permission.FsPermission; 165import org.apache.hadoop.fs.permission.PermissionStatus; 166import org.apache.hadoop.fs.StorageType; 167import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; 168import org.apache.hadoop.ha.ServiceFailedException; 169import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy; 170import org.apache.hadoop.hdfs.DFSConfigKeys; 171import org.apache.hadoop.hdfs.DFSUtil; 172import org.apache.hadoop.hdfs.HAUtil; 173import org.apache.hadoop.hdfs.HdfsConfiguration; 174import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException; 175import org.apache.hadoop.hdfs.XAttrHelper; 176import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; 177import org.apache.hadoop.hdfs.protocol.Block; 178import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry; 179import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; 180import org.apache.hadoop.hdfs.protocol.CachePoolEntry; 181import org.apache.hadoop.hdfs.protocol.CachePoolInfo; 182import org.apache.hadoop.hdfs.protocol.ClientProtocol; 183import org.apache.hadoop.hdfs.protocol.DatanodeID; 184import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 185import org.apache.hadoop.hdfs.protocol.DirectoryListing; 
186import org.apache.hadoop.hdfs.protocol.EncryptionZone; 187import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 188import org.apache.hadoop.hdfs.protocol.HdfsConstants; 189import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus; 190import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; 191import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; 192import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; 193import org.apache.hadoop.hdfs.protocol.LocatedBlock; 194import org.apache.hadoop.hdfs.protocol.LocatedBlocks; 195import org.apache.hadoop.hdfs.protocol.QuotaExceededException; 196import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException; 197import org.apache.hadoop.hdfs.protocol.RollingUpgradeException; 198import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo; 199import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException; 200import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; 201import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus; 202import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure; 203import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; 204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode; 205import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; 206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; 207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState; 208import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection; 209import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager; 210import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous; 211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction; 212import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 213import 
org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; 214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager; 215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics; 216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo; 217import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; 218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; 219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption; 220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; 221import org.apache.hadoop.hdfs.server.common.Storage; 222import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType; 223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; 224import org.apache.hadoop.hdfs.server.common.Util; 225import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection; 226import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo; 227import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream; 228import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease; 229import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; 230import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; 231import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer; 232import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; 233import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer; 234import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean; 235import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; 236import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 237import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager; 238import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 239import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 241import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status; 242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 243import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 244import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger; 245import org.apache.hadoop.hdfs.server.namenode.top.TopConf; 246import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics; 247import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager; 248import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; 249import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; 250import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; 251import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport; 252import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; 253import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; 254import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; 255import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; 256import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; 257import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; 258import org.apache.hadoop.hdfs.server.protocol.StorageReport; 259import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; 260import org.apache.hadoop.io.EnumSetWritable; 261import org.apache.hadoop.io.IOUtils; 262import org.apache.hadoop.io.Text; 263import org.apache.hadoop.ipc.RetriableException; 264import org.apache.hadoop.ipc.RetryCache; 265import org.apache.hadoop.ipc.Server; 266import org.apache.hadoop.ipc.StandbyException; 267import org.apache.hadoop.metrics2.annotation.Metric; 268import org.apache.hadoop.metrics2.annotation.Metrics; 269import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; 270import org.apache.hadoop.metrics2.lib.MetricsRegistry; 271import 
org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation; 272import org.apache.hadoop.metrics2.util.MBeans; 273import org.apache.hadoop.net.NetworkTopology; 274import org.apache.hadoop.net.Node; 275import org.apache.hadoop.net.NodeBase; 276import org.apache.hadoop.security.AccessControlException; 277import org.apache.hadoop.security.UserGroupInformation; 278import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; 279import org.apache.hadoop.security.token.SecretManager.InvalidToken; 280import org.apache.hadoop.security.token.Token; 281import org.apache.hadoop.security.token.TokenIdentifier; 282import org.apache.hadoop.security.token.delegation.DelegationKey; 283import org.apache.hadoop.util.ChunkedArrayList; 284import org.apache.hadoop.util.Daemon; 285import org.apache.hadoop.util.DataChecksum; 286import org.apache.hadoop.util.ReflectionUtils; 287import org.apache.hadoop.util.StringUtils; 288import org.apache.hadoop.util.VersionInfo; 289import org.apache.log4j.Appender; 290import org.apache.log4j.AsyncAppender; 291import org.apache.log4j.Logger; 292import org.codehaus.jackson.map.ObjectMapper; 293import org.mortbay.util.ajax.JSON; 294 295import com.google.common.annotations.VisibleForTesting; 296import com.google.common.base.Charsets; 297import com.google.common.base.Preconditions; 298import com.google.common.collect.ImmutableMap; 299import com.google.common.collect.Lists; 300 301/*************************************************** 302 * FSNamesystem does the actual bookkeeping work for the 303 * DataNode. 304 * 305 * It tracks several important tables. 
306 * 307 * 1) valid fsname --> blocklist (kept on disk, logged) 308 * 2) Set of all valid blocks (inverted #1) 309 * 3) block --> machinelist (kept in memory, rebuilt dynamically from reports) 310 * 4) machine --> blocklist (inverted #2) 311 * 5) LRU cache of updated-heartbeat machines 312 ***************************************************/ 313@InterfaceAudience.Private 314@Metrics(context="dfs") 315public class FSNamesystem implements Namesystem, FSNamesystemMBean, 316 NameNodeMXBean { 317 public static final Log LOG = LogFactory.getLog(FSNamesystem.class); 318 private final MetricsRegistry registry = new MetricsRegistry("FSNamesystem"); 319 @Metric final MutableRatesWithAggregation detailedLockHoldTimeMetrics = 320 registry.newRatesWithAggregation("detailedLockHoldTimeMetrics"); 321 322 private static final ThreadLocal<StringBuilder> auditBuffer = 323 new ThreadLocal<StringBuilder>() { 324 @Override 325 protected StringBuilder initialValue() { 326 return new StringBuilder(); 327 } 328 }; 329 330 private final BlockIdManager blockIdManager; 331 332 @VisibleForTesting 333 public boolean isAuditEnabled() { 334 return !isDefaultAuditLogger || auditLog.isInfoEnabled(); 335 } 336 337 private void logAuditEvent(boolean succeeded, String cmd, String src) 338 throws IOException { 339 logAuditEvent(succeeded, cmd, src, null, null); 340 } 341 342 private void logAuditEvent(boolean succeeded, String cmd, String src, 343 String dst, HdfsFileStatus stat) throws IOException { 344 if (isAuditEnabled() && isExternalInvocation()) { 345 logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(), 346 cmd, src, dst, stat); 347 } 348 } 349 350 private void logAuditEvent(boolean succeeded, 351 UserGroupInformation ugi, InetAddress addr, String cmd, String src, 352 String dst, HdfsFileStatus stat) { 353 FileStatus status = null; 354 if (stat != null) { 355 Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null; 356 Path path = dst != null ? 
new Path(dst) : new Path(src); 357 status = new FileStatus(stat.getLen(), stat.isDir(), 358 stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(), 359 stat.getAccessTime(), stat.getPermission(), stat.getOwner(), 360 stat.getGroup(), symlink, path); 361 } 362 for (AuditLogger logger : auditLoggers) { 363 if (logger instanceof HdfsAuditLogger) { 364 HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger; 365 hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst, 366 status, ugi, dtSecretManager); 367 } else { 368 logger.logAuditEvent(succeeded, ugi.toString(), addr, 369 cmd, src, dst, status); 370 } 371 } 372 } 373 374 /** 375 * Logger for audit events, noting successful FSNamesystem operations. Emits 376 * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated 377 * <code>key=value</code> pairs to be written for the following properties: 378 * <code> 379 * ugi=<ugi in RPC> 380 * ip=<remote IP> 381 * cmd=<command> 382 * src=<src path> 383 * dst=<dst path (optional)> 384 * perm=<permissions (optional)> 385 * </code> 386 */ 387 public static final Log auditLog = LogFactory.getLog( 388 FSNamesystem.class.getName() + ".audit"); 389 390 static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100; 391 static int BLOCK_DELETION_INCREMENT = 1000; 392 private final boolean isPermissionEnabled; 393 private final UserGroupInformation fsOwner; 394 private final String supergroup; 395 private final boolean standbyShouldCheckpoint; 396 397 // Scan interval is not configurable. 
private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS); // one hour, in ms
  /**
   * Delegation token secret manager. It is handed to HdfsAuditLogger
   * instances when audit events are dispatched and reset by {@code clear()}.
   */
  final DelegationTokenSecretManager dtSecretManager;
  // NOTE(review): judging by the name this is a test-only switch; confirm at
  // its use sites.
  private final boolean alwaysUseDelegationTokensForTests;

  /** Startup-progress step of type AWAITING_REPORTED_BLOCKS. */
  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
      new Step(StepType.AWAITING_REPORTED_BLOCKS);

  // Tracks whether the default audit logger is the only configured audit
  // logger; this allows isAuditEnabled() to return false in case the
  // underlying logger is disabled, and avoid some unnecessary work.
  private final boolean isDefaultAuditLogger;
  /** Audit loggers; every audit event is passed to each of them in order. */
  private final List<AuditLogger> auditLoggers;

  /** The namespace tree. */
  FSDirectory dir;
  private final BlockManager blockManager;
  private final SnapshotManager snapshotManager;
  private final CacheManager cacheManager;
  /** Obtained from the block manager's datanode manager in the constructor. */
  private final DatanodeStatistics datanodeStatistics;

  /** Nameservice ID resolved from the configuration; may be null. */
  private String nameserviceId;

  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
  /**
   * A flag that indicates whether the checkpointer should checkpoint a rollback
   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
   * rollback fsimage if the flag is true, and then change the flag to false.
   */
  private volatile boolean needRollbackFsImage;

  // Block pool ID used by this namenode
  private String blockPoolId;

  /** Lease manager, created eagerly with a back-reference to this object. */
  final LeaseManager leaseManager = new LeaseManager(this);

  volatile Daemon smmthread = null;  // SafeModeMonitor thread

  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread

  // A daemon to periodically clean up corrupt lazyPersist files
  // from the name space.
442 Daemon lazyPersistFileScrubber = null; 443 /** 444 * When an active namenode will roll its own edit log, in # edits 445 */ 446 private final long editLogRollerThreshold; 447 /** 448 * Check interval of an active namenode's edit log roller thread 449 */ 450 private final int editLogRollerInterval; 451 452 /** 453 * How frequently we scan and unlink corrupt lazyPersist files. 454 * (In seconds) 455 */ 456 private final int lazyPersistFileScrubIntervalSec; 457 458 private volatile boolean hasResourcesAvailable = false; 459 private volatile boolean fsRunning = true; 460 461 /** The start time of the namesystem. */ 462 private final long startTime = now(); 463 464 /** The interval of namenode checking for the disk space availability */ 465 private final long resourceRecheckInterval; 466 467 // The actual resource checker instance. 468 NameNodeResourceChecker nnResourceChecker; 469 470 private final FsServerDefaults serverDefaults; 471 private final boolean supportAppends; 472 private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure; 473 474 private volatile SafeModeInfo safeMode; // safe mode information 475 476 private final long maxFsObjects; // maximum number of fs objects 477 478 private final long minBlockSize; // minimum block size 479 private final long maxBlocksPerFile; // maximum # of blocks per file 480 481 // precision of access times. 482 private final long accessTimePrecision; 483 484 /** Lock to protect FSNamesystem. */ 485 private final FSNamesystemLock fsLock; 486 487 /** 488 * Checkpoint lock to protect FSNamesystem modification on standby NNs. 489 * Unlike fsLock, it does not affect block updates. On active NNs, this lock 490 * does not provide proper protection, because there are operations that 491 * modify both block and name system state. Even on standby, fsLock is 492 * used when block state changes need to be blocked. 
493 */ 494 private final ReentrantLock cpLock; 495 496 /** 497 * Used when this NN is in standby state to read from the shared edit log. 498 */ 499 private EditLogTailer editLogTailer = null; 500 501 /** 502 * Used when this NN is in standby state to perform checkpoints. 503 */ 504 private StandbyCheckpointer standbyCheckpointer; 505 506 /** 507 * Reference to the NN's HAContext object. This is only set once 508 * {@link #startCommonServices(Configuration, HAContext)} is called. 509 */ 510 private HAContext haContext; 511 512 private final boolean haEnabled; 513 514 /** flag indicating whether replication queues have been initialized */ 515 boolean initializedReplQueues = false; 516 517 /** 518 * Whether the namenode is in the middle of starting the active service 519 */ 520 private volatile boolean startingActiveService = false; 521 522 private final RetryCache retryCache; 523 524 private KeyProviderCryptoExtension provider = null; 525 526 private volatile boolean imageLoaded = false; 527 private final Condition cond; 528 529 private final FSImage fsImage; 530 531 private final TopConf topConf; 532 private TopMetrics topMetrics; 533 534 private INodeAttributeProvider inodeAttributeProvider; 535 536 /** 537 * Notify that loading of this FSDirectory is complete, and 538 * it is imageLoaded for use 539 */ 540 void imageLoadComplete() { 541 Preconditions.checkState(!imageLoaded, "FSDirectory already loaded"); 542 setImageLoaded(); 543 } 544 545 void setImageLoaded() { 546 if(imageLoaded) return; 547 writeLock(); 548 try { 549 setImageLoaded(true); 550 dir.markNameCacheInitialized(); 551 cond.signalAll(); 552 } finally { 553 writeUnlock("setImageLoaded"); 554 } 555 } 556 557 //This is for testing purposes only 558 @VisibleForTesting 559 boolean isImageLoaded() { 560 return imageLoaded; 561 } 562 563 // exposed for unit tests 564 protected void setImageLoaded(boolean flag) { 565 imageLoaded = flag; 566 } 567 568 /** 569 * Block until the object is imageLoaded to be 
used. 570 */ 571 void waitForLoadingFSImage() { 572 if (!imageLoaded) { 573 writeLock(); 574 try { 575 while (!imageLoaded) { 576 try { 577 cond.await(5000, TimeUnit.MILLISECONDS); 578 } catch (InterruptedException ignored) { 579 } 580 } 581 } finally { 582 writeUnlock(); 583 } 584 } 585 } 586 587 /** 588 * Clear all loaded data 589 */ 590 void clear() { 591 dir.reset(); 592 dtSecretManager.reset(); 593 blockIdManager.clear(); 594 leaseManager.removeAllLeases(); 595 snapshotManager.clearSnapshottableDirs(); 596 cacheManager.clear(); 597 setImageLoaded(false); 598 blockManager.clear(); 599 } 600 601 @VisibleForTesting 602 LeaseManager getLeaseManager() { 603 return leaseManager; 604 } 605 606 boolean isHaEnabled() { 607 return haEnabled; 608 } 609 610 /** 611 * Check the supplied configuration for correctness. 612 * @param conf Supplies the configuration to validate. 613 * @throws IOException if the configuration could not be queried. 614 * @throws IllegalArgumentException if the configuration is invalid. 615 */ 616 private static void checkConfiguration(Configuration conf) 617 throws IOException { 618 619 final Collection<URI> namespaceDirs = 620 FSNamesystem.getNamespaceDirs(conf); 621 final Collection<URI> editsDirs = 622 FSNamesystem.getNamespaceEditsDirs(conf); 623 final Collection<URI> requiredEditsDirs = 624 FSNamesystem.getRequiredNamespaceEditsDirs(conf); 625 final Collection<URI> sharedEditsDirs = 626 FSNamesystem.getSharedEditsDirs(conf); 627 628 for (URI u : requiredEditsDirs) { 629 if (u.toString().compareTo( 630 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) { 631 continue; 632 } 633 634 // Each required directory must also be in editsDirs or in 635 // sharedEditsDirs. 636 if (!editsDirs.contains(u) && 637 !sharedEditsDirs.contains(u)) { 638 throw new IllegalArgumentException( 639 "Required edits directory " + u.toString() + " not present in " + 640 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". 
" + 641 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" + 642 editsDirs.toString() + "; " + 643 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" + 644 requiredEditsDirs.toString() + ". " + 645 DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" + 646 sharedEditsDirs.toString() + "."); 647 } 648 } 649 650 if (namespaceDirs.size() == 1) { 651 LOG.warn("Only one image storage directory (" 652 + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss" 653 + " due to lack of redundant storage directories!"); 654 } 655 if (editsDirs.size() == 1) { 656 LOG.warn("Only one namespace edits storage directory (" 657 + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss" 658 + " due to lack of redundant storage directories!"); 659 } 660 } 661 662 /** 663 * Instantiates an FSNamesystem loaded from the image and edits 664 * directories specified in the passed Configuration. 665 * 666 * @param conf the Configuration which specifies the storage directories 667 * from which to load 668 * @return an FSNamesystem which contains the loaded namespace 669 * @throws IOException if loading fails 670 */ 671 static FSNamesystem loadFromDisk(Configuration conf) throws IOException { 672 673 checkConfiguration(conf); 674 FSImage fsImage = new FSImage(conf, 675 FSNamesystem.getNamespaceDirs(conf), 676 FSNamesystem.getNamespaceEditsDirs(conf)); 677 FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false); 678 StartupOption startOpt = NameNode.getStartupOption(conf); 679 if (startOpt == StartupOption.RECOVER) { 680 namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER); 681 } 682 683 long loadStart = monotonicNow(); 684 try { 685 namesystem.loadFSImage(startOpt); 686 } catch (IOException ioe) { 687 LOG.warn("Encountered exception loading fsimage", ioe); 688 fsImage.close(); 689 throw ioe; 690 } 691 long timeTakenToLoadFSImage = monotonicNow() - loadStart; 692 LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs"); 693 NameNodeMetrics 
nnMetrics = NameNode.getNameNodeMetrics(); 694 if (nnMetrics != null) { 695 nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage); 696 } 697 return namesystem; 698 } 699 700 FSNamesystem(Configuration conf, FSImage fsImage) throws IOException { 701 this(conf, fsImage, false); 702 } 703 704 /** 705 * Create an FSNamesystem associated with the specified image. 706 * 707 * Note that this does not load any data off of disk -- if you would 708 * like that behavior, use {@link #loadFromDisk(Configuration)} 709 * 710 * @param conf configuration 711 * @param fsImage The FSImage to associate with 712 * @param ignoreRetryCache Whether or not should ignore the retry cache setup 713 * step. For Secondary NN this should be set to true. 714 * @throws IOException on bad configuration 715 */ 716 FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache) 717 throws IOException { 718 provider = DFSUtil.createKeyProviderCryptoExtension(conf); 719 if (provider == null) { 720 LOG.info("No KeyProvider found."); 721 } else { 722 LOG.info("Found KeyProvider: " + provider.toString()); 723 } 724 if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY, 725 DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) { 726 LOG.info("Enabling async auditlog"); 727 enableAsyncAuditLog(); 728 } 729 fsLock = new FSNamesystemLock(conf, detailedLockHoldTimeMetrics); 730 cond = fsLock.newWriteLockCondition(); 731 cpLock = new ReentrantLock(); 732 733 this.fsImage = fsImage; 734 try { 735 resourceRecheckInterval = conf.getLong( 736 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, 737 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT); 738 739 this.blockManager = new BlockManager(this, conf); 740 this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics(); 741 this.blockIdManager = new BlockIdManager(blockManager); 742 743 this.fsOwner = UserGroupInformation.getCurrentUser(); 744 this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 745 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT); 746 
this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY, 747 DFS_PERMISSIONS_ENABLED_DEFAULT); 748 LOG.info("fsOwner = " + fsOwner); 749 LOG.info("supergroup = " + supergroup); 750 LOG.info("isPermissionEnabled = " + isPermissionEnabled); 751 752 // block allocation has to be persisted in HA using a shared edits directory 753 // so that the standby has up-to-date namespace information 754 nameserviceId = DFSUtil.getNamenodeNameServiceId(conf); 755 this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId); 756 757 // Sanity check the HA-related config. 758 if (nameserviceId != null) { 759 LOG.info("Determined nameservice ID: " + nameserviceId); 760 } 761 LOG.info("HA Enabled: " + haEnabled); 762 if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) { 763 LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf)); 764 throw new IOException("Invalid configuration: a shared edits dir " + 765 "must not be specified if HA is not enabled."); 766 } 767 768 // Get the checksum type from config 769 String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT); 770 DataChecksum.Type checksumType; 771 try { 772 checksumType = DataChecksum.Type.valueOf(checksumTypeStr); 773 } catch (IllegalArgumentException iae) { 774 throw new IOException("Invalid checksum type in " 775 + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr); 776 } 777 778 this.serverDefaults = new FsServerDefaults( 779 conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT), 780 conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT), 781 conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT), 782 (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT), 783 conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT), 784 conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT), 785 conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT), 786 checksumType); 787 788 
this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 789 DFS_NAMENODE_MAX_OBJECTS_DEFAULT); 790 791 this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY, 792 DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT); 793 this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY, 794 DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT); 795 this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 796 DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT); 797 this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT); 798 LOG.info("Append Enabled: " + supportAppends); 799 800 this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf); 801 802 this.standbyShouldCheckpoint = conf.getBoolean( 803 DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT); 804 // # edit autoroll threshold is a multiple of the checkpoint threshold 805 this.editLogRollerThreshold = (long) 806 (conf.getFloat( 807 DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD, 808 DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) * 809 conf.getLong( 810 DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 811 DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT)); 812 this.editLogRollerInterval = conf.getInt( 813 DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS, 814 DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT); 815 816 this.lazyPersistFileScrubIntervalSec = conf.getInt( 817 DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC, 818 DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT); 819 820 if (this.lazyPersistFileScrubIntervalSec == 0) { 821 throw new IllegalArgumentException( 822 DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero."); 823 } 824 825 // For testing purposes, allow the DT secret manager to be started regardless 826 // of whether security is enabled. 
827 alwaysUseDelegationTokensForTests = conf.getBoolean( 828 DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, 829 DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT); 830 831 this.dtSecretManager = createDelegationTokenSecretManager(conf); 832 this.dir = new FSDirectory(this, conf); 833 this.snapshotManager = new SnapshotManager(dir); 834 this.cacheManager = new CacheManager(this, conf, blockManager); 835 this.safeMode = new SafeModeInfo(conf); 836 this.topConf = new TopConf(conf); 837 this.auditLoggers = initAuditLoggers(conf); 838 this.isDefaultAuditLogger = auditLoggers.size() == 1 && 839 auditLoggers.get(0) instanceof DefaultAuditLogger; 840 this.retryCache = ignoreRetryCache ? null : initRetryCache(conf); 841 Class<? extends INodeAttributeProvider> klass = conf.getClass( 842 DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY, 843 null, INodeAttributeProvider.class); 844 if (klass != null) { 845 inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf); 846 LOG.info("Using INode attribute provider: " + klass.getName()); 847 } 848 } catch(IOException e) { 849 LOG.error(getClass().getSimpleName() + " initialization failed.", e); 850 close(); 851 throw e; 852 } catch (RuntimeException re) { 853 LOG.error(getClass().getSimpleName() + " initialization failed.", re); 854 close(); 855 throw re; 856 } 857 } 858 859 @VisibleForTesting 860 public List<AuditLogger> getAuditLoggers() { 861 return auditLoggers; 862 } 863 864 @VisibleForTesting 865 public RetryCache getRetryCache() { 866 return retryCache; 867 } 868 869 void lockRetryCache() { 870 if (retryCache != null) { 871 retryCache.lock(); 872 } 873 } 874 875 void unlockRetryCache() { 876 if (retryCache != null) { 877 retryCache.unlock(); 878 } 879 } 880 881 /** Whether or not retry cache is enabled */ 882 boolean hasRetryCache() { 883 return retryCache != null; 884 } 885 886 void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) { 887 if (retryCache != null) { 888 
retryCache.addCacheEntryWithPayload(clientId, callId, payload); 889 } 890 } 891 892 void addCacheEntry(byte[] clientId, int callId) { 893 if (retryCache != null) { 894 retryCache.addCacheEntry(clientId, callId); 895 } 896 } 897 898 @VisibleForTesting 899 public KeyProviderCryptoExtension getProvider() { 900 return provider; 901 } 902 903 @VisibleForTesting 904 static RetryCache initRetryCache(Configuration conf) { 905 boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY, 906 DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT); 907 LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled")); 908 if (enable) { 909 float heapPercent = conf.getFloat( 910 DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY, 911 DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT); 912 long entryExpiryMillis = conf.getLong( 913 DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY, 914 DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT); 915 LOG.info("Retry cache will use " + heapPercent 916 + " of total heap and retry cache entry expiry time is " 917 + entryExpiryMillis + " millis"); 918 long entryExpiryNanos = entryExpiryMillis * 1000 * 1000; 919 return new RetryCache("NameNodeRetryCache", heapPercent, 920 entryExpiryNanos); 921 } 922 return null; 923 } 924 925 private List<AuditLogger> initAuditLoggers(Configuration conf) { 926 // Initialize the custom access loggers if configured. 
927 Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY); 928 List<AuditLogger> auditLoggers = Lists.newArrayList(); 929 if (alClasses != null && !alClasses.isEmpty()) { 930 for (String className : alClasses) { 931 try { 932 AuditLogger logger; 933 if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) { 934 logger = new DefaultAuditLogger(); 935 } else { 936 logger = (AuditLogger) Class.forName(className).newInstance(); 937 } 938 logger.initialize(conf); 939 auditLoggers.add(logger); 940 } catch (RuntimeException re) { 941 throw re; 942 } catch (Exception e) { 943 throw new RuntimeException(e); 944 } 945 } 946 } 947 948 // Make sure there is at least one logger installed. 949 if (auditLoggers.isEmpty()) { 950 auditLoggers.add(new DefaultAuditLogger()); 951 } 952 953 // Add audit logger to calculate top users 954 if (topConf.isEnabled) { 955 topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs); 956 auditLoggers.add(new TopAuditLogger(topMetrics)); 957 } 958 959 return Collections.unmodifiableList(auditLoggers); 960 } 961 962 private void loadFSImage(StartupOption startOpt) throws IOException { 963 final FSImage fsImage = getFSImage(); 964 965 // format before starting up if requested 966 if (startOpt == StartupOption.FORMAT) { 967 968 fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id 969 970 startOpt = StartupOption.REGULAR; 971 } 972 boolean success = false; 973 writeLock(); 974 try { 975 // We shouldn't be calling saveNamespace if we've come up in standby state. 
976 MetaRecoveryContext recovery = startOpt.createRecoveryContext(); 977 final boolean staleImage 978 = fsImage.recoverTransitionRead(startOpt, this, recovery); 979 if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) || 980 RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) { 981 rollingUpgradeInfo = null; 982 } 983 final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 984 LOG.info("Need to save fs image? " + needToSave 985 + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled 986 + ", isRollingUpgrade=" + isRollingUpgrade() + ")"); 987 if (needToSave) { 988 fsImage.saveNamespace(this); 989 } else { 990 updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(), 991 startOpt); 992 // No need to save, so mark the phase done. 993 StartupProgress prog = NameNode.getStartupProgress(); 994 prog.beginPhase(Phase.SAVING_CHECKPOINT); 995 prog.endPhase(Phase.SAVING_CHECKPOINT); 996 } 997 // This will start a new log segment and write to the seen_txid file, so 998 // we shouldn't do it when coming up in standby state 999 if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE) 1000 || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) { 1001 fsImage.openEditLogForWrite(); 1002 } 1003 success = true; 1004 } finally { 1005 if (!success) { 1006 fsImage.close(); 1007 } 1008 writeUnlock("loadFSImage"); 1009 } 1010 imageLoadComplete(); 1011 } 1012 1013 private void updateStorageVersionForRollingUpgrade(final long layoutVersion, 1014 StartupOption startOpt) throws IOException { 1015 boolean rollingStarted = RollingUpgradeStartupOption.STARTED 1016 .matches(startOpt) && layoutVersion > HdfsConstants 1017 .NAMENODE_LAYOUT_VERSION; 1018 boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK 1019 .matches(startOpt); 1020 if (rollingRollback || rollingStarted) { 1021 fsImage.updateStorageVersion(); 1022 } 1023 } 1024 1025 private void startSecretManager() { 1026 if (dtSecretManager != null) { 1027 try { 1028 
dtSecretManager.startThreads(); 1029 } catch (IOException e) { 1030 // Inability to start secret manager 1031 // can't be recovered from. 1032 throw new RuntimeException(e); 1033 } 1034 } 1035 } 1036 1037 private void startSecretManagerIfNecessary() { 1038 boolean shouldRun = shouldUseDelegationTokens() && 1039 !isInSafeMode() && getEditLog().isOpenForWrite(); 1040 boolean running = dtSecretManager.isRunning(); 1041 if (shouldRun && !running) { 1042 startSecretManager(); 1043 } 1044 } 1045 1046 private void stopSecretManager() { 1047 if (dtSecretManager != null) { 1048 dtSecretManager.stopThreads(); 1049 } 1050 } 1051 1052 /** 1053 * Start services common to both active and standby states 1054 */ 1055 void startCommonServices(Configuration conf, HAContext haContext) throws IOException { 1056 this.registerMBean(); // register the MBean for the FSNamesystemState 1057 writeLock(); 1058 this.haContext = haContext; 1059 try { 1060 nnResourceChecker = new NameNodeResourceChecker(conf); 1061 checkAvailableResources(); 1062 assert safeMode != null && !isPopulatingReplQueues(); 1063 StartupProgress prog = NameNode.getStartupProgress(); 1064 prog.beginPhase(Phase.SAFEMODE); 1065 prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS, 1066 getCompleteBlocksTotal()); 1067 setBlockTotal(); 1068 blockManager.activate(conf); 1069 } finally { 1070 writeUnlock("startCommonServices"); 1071 } 1072 1073 registerMXBean(); 1074 DefaultMetricsSystem.instance().register(this); 1075 if (inodeAttributeProvider != null) { 1076 inodeAttributeProvider.start(); 1077 dir.setINodeAttributeProvider(inodeAttributeProvider); 1078 } 1079 snapshotManager.registerMXBean(); 1080 } 1081 1082 /** 1083 * Stop services common to both active and standby states 1084 */ 1085 void stopCommonServices() { 1086 writeLock(); 1087 if (inodeAttributeProvider != null) { 1088 dir.setINodeAttributeProvider(null); 1089 inodeAttributeProvider.stop(); 1090 } 1091 try { 1092 if (blockManager != null) 
blockManager.close(); 1093 } finally { 1094 writeUnlock("stopCommonServices"); 1095 } 1096 RetryCache.clear(retryCache); 1097 } 1098 1099 /** 1100 * Start services required in active state 1101 * @throws IOException 1102 */ 1103 void startActiveServices() throws IOException { 1104 startingActiveService = true; 1105 LOG.info("Starting services required for active state"); 1106 writeLock(); 1107 try { 1108 FSEditLog editLog = getFSImage().getEditLog(); 1109 1110 if (!editLog.isOpenForWrite()) { 1111 // During startup, we're already open for write during initialization. 1112 editLog.initJournalsForWrite(); 1113 // May need to recover 1114 editLog.recoverUnclosedStreams(); 1115 1116 LOG.info("Catching up to latest edits from old active before " + 1117 "taking over writer role in edits logs"); 1118 editLogTailer.catchupDuringFailover(); 1119 1120 blockManager.setPostponeBlocksFromFuture(false); 1121 blockManager.getDatanodeManager().markAllDatanodesStale(); 1122 blockManager.clearQueues(); 1123 blockManager.processAllPendingDNMessages(); 1124 1125 // Only need to re-process the queue, If not in SafeMode. 1126 if (!isInSafeMode()) { 1127 LOG.info("Reprocessing replication and invalidation queues"); 1128 initializeReplQueues(); 1129 } 1130 1131 if (LOG.isDebugEnabled()) { 1132 LOG.debug("NameNode metadata after re-processing " + 1133 "replication and invalidation queues during failover:\n" + 1134 metaSaveAsString()); 1135 } 1136 1137 long nextTxId = getFSImage().getLastAppliedTxId() + 1; 1138 LOG.info("Will take over writing edit logs at txnid " + 1139 nextTxId); 1140 editLog.setNextTxId(nextTxId); 1141 1142 getFSImage().editLog.openForWrite(); 1143 } 1144 1145 // Enable quota checks. 1146 dir.enableQuotaChecks(); 1147 if (haEnabled) { 1148 // Renew all of the leases before becoming active. 1149 // This is because, while we were in standby mode, 1150 // the leases weren't getting renewed on this NN. 1151 // Give them all a fresh start here. 
1152 leaseManager.renewAllLeases(); 1153 } 1154 leaseManager.startMonitor(); 1155 startSecretManagerIfNecessary(); 1156 1157 //ResourceMonitor required only at ActiveNN. See HDFS-2914 1158 this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); 1159 nnrmthread.start(); 1160 1161 nnEditLogRoller = new Daemon(new NameNodeEditLogRoller( 1162 editLogRollerThreshold, editLogRollerInterval)); 1163 nnEditLogRoller.start(); 1164 1165 if (lazyPersistFileScrubIntervalSec > 0) { 1166 lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber( 1167 lazyPersistFileScrubIntervalSec)); 1168 lazyPersistFileScrubber.start(); 1169 } 1170 1171 cacheManager.startMonitorThread(); 1172 blockManager.getDatanodeManager().setShouldSendCachingCommands(true); 1173 } finally { 1174 startingActiveService = false; 1175 checkSafeMode(); 1176 writeUnlock("startActiveServices"); 1177 } 1178 } 1179 1180 /** 1181 * Initialize replication queues. 1182 */ 1183 private void initializeReplQueues() { 1184 LOG.info("initializing replication queues"); 1185 blockManager.processMisReplicatedBlocks(); 1186 initializedReplQueues = true; 1187 } 1188 1189 private boolean inActiveState() { 1190 return haContext != null && 1191 haContext.getState().getServiceState() == HAServiceState.ACTIVE; 1192 } 1193 1194 /** 1195 * @return Whether the namenode is transitioning to active state and is in the 1196 * middle of the {@link #startActiveServices()} 1197 */ 1198 public boolean inTransitionToActive() { 1199 return haEnabled && inActiveState() && startingActiveService; 1200 } 1201 1202 private boolean shouldUseDelegationTokens() { 1203 return UserGroupInformation.isSecurityEnabled() || 1204 alwaysUseDelegationTokensForTests; 1205 } 1206 1207 /** 1208 * Stop services required in active state 1209 */ 1210 void stopActiveServices() { 1211 LOG.info("Stopping services started for active state"); 1212 writeLock(); 1213 try { 1214 stopSecretManager(); 1215 leaseManager.stopMonitor(); 1216 if (nnrmthread != null) { 
1217 ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor(); 1218 nnrmthread.interrupt(); 1219 } 1220 if (nnEditLogRoller != null) { 1221 ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop(); 1222 nnEditLogRoller.interrupt(); 1223 } 1224 if (lazyPersistFileScrubber != null) { 1225 ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop(); 1226 lazyPersistFileScrubber.interrupt(); 1227 } 1228 if (dir != null && getFSImage() != null) { 1229 if (getFSImage().editLog != null) { 1230 getFSImage().editLog.close(); 1231 } 1232 // Update the fsimage with the last txid that we wrote 1233 // so that the tailer starts from the right spot. 1234 getFSImage().updateLastAppliedTxIdFromWritten(); 1235 } 1236 if (cacheManager != null) { 1237 cacheManager.stopMonitorThread(); 1238 cacheManager.clearDirectiveStats(); 1239 } 1240 blockManager.getDatanodeManager().clearPendingCachingCommands(); 1241 blockManager.getDatanodeManager().setShouldSendCachingCommands(false); 1242 // Don't want to keep replication queues when not in Active. 1243 blockManager.clearQueues(); 1244 initializedReplQueues = false; 1245 } finally { 1246 writeUnlock("stopActiveServices"); 1247 } 1248 } 1249 1250 /** 1251 * Start services required in standby state 1252 * 1253 * @throws IOException 1254 */ 1255 void startStandbyServices(final Configuration conf) throws IOException { 1256 LOG.info("Starting services required for standby state"); 1257 if (!getFSImage().editLog.isOpenForRead()) { 1258 // During startup, we're already open for read. 1259 getFSImage().editLog.initSharedJournalsForRead(); 1260 } 1261 1262 blockManager.setPostponeBlocksFromFuture(true); 1263 1264 // Disable quota checks while in standby. 
1265 dir.disableQuotaChecks(); 1266 editLogTailer = new EditLogTailer(this, conf); 1267 editLogTailer.start(); 1268 if (standbyShouldCheckpoint) { 1269 standbyCheckpointer = new StandbyCheckpointer(conf, this); 1270 standbyCheckpointer.start(); 1271 } 1272 } 1273 1274 /** 1275 * Called when the NN is in Standby state and the editlog tailer tails the 1276 * OP_ROLLING_UPGRADE_START. 1277 */ 1278 void triggerRollbackCheckpoint() { 1279 setNeedRollbackFsImage(true); 1280 if (standbyCheckpointer != null) { 1281 standbyCheckpointer.triggerRollbackCheckpoint(); 1282 } 1283 } 1284 1285 /** 1286 * Called while the NN is in Standby state, but just about to be 1287 * asked to enter Active state. This cancels any checkpoints 1288 * currently being taken. 1289 */ 1290 void prepareToStopStandbyServices() throws ServiceFailedException { 1291 if (standbyCheckpointer != null) { 1292 standbyCheckpointer.cancelAndPreventCheckpoints( 1293 "About to leave standby state"); 1294 } 1295 } 1296 1297 /** Stop services required in standby state */ 1298 void stopStandbyServices() throws IOException { 1299 LOG.info("Stopping services started for standby state"); 1300 if (standbyCheckpointer != null) { 1301 standbyCheckpointer.stop(); 1302 } 1303 if (editLogTailer != null) { 1304 editLogTailer.stop(); 1305 } 1306 if (dir != null && getFSImage() != null && getFSImage().editLog != null) { 1307 getFSImage().editLog.close(); 1308 } 1309 } 1310 1311 @Override 1312 public void checkOperation(OperationCategory op) throws StandbyException { 1313 if (haContext != null) { 1314 // null in some unit tests 1315 haContext.checkOperation(op); 1316 } 1317 } 1318 1319 /** 1320 * @throws RetriableException 1321 * If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3) 1322 * NameNode is in active state 1323 * @throws SafeModeException 1324 * Otherwise if NameNode is in SafeMode. 
1325 */ 1326 void checkNameNodeSafeMode(String errorMsg) 1327 throws RetriableException, SafeModeException { 1328 if (isInSafeMode()) { 1329 SafeModeException se = new SafeModeException(errorMsg, safeMode); 1330 if (haEnabled && haContext != null 1331 && haContext.getState().getServiceState() == HAServiceState.ACTIVE 1332 && shouldRetrySafeMode(this.safeMode)) { 1333 throw new RetriableException(se); 1334 } else { 1335 throw se; 1336 } 1337 } 1338 } 1339 1340 boolean isPermissionEnabled() { 1341 return isPermissionEnabled; 1342 } 1343 1344 /** 1345 * We already know that the safemode is on. We will throw a RetriableException 1346 * if the safemode is not manual or caused by low resource. 1347 */ 1348 private boolean shouldRetrySafeMode(SafeModeInfo safeMode) { 1349 if (safeMode == null) { 1350 return false; 1351 } else { 1352 return !safeMode.isManual() && !safeMode.areResourcesLow(); 1353 } 1354 } 1355 1356 public static Collection<URI> getNamespaceDirs(Configuration conf) { 1357 return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY); 1358 } 1359 1360 /** 1361 * Get all edits dirs which are required. If any shared edits dirs are 1362 * configured, these are also included in the set of required dirs. 1363 * 1364 * @param conf the HDFS configuration. 1365 * @return all required dirs. 
1366 */ 1367 public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) { 1368 Set<URI> ret = new HashSet<URI>(); 1369 ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY)); 1370 ret.addAll(getSharedEditsDirs(conf)); 1371 return ret; 1372 } 1373 1374 private static Collection<URI> getStorageDirs(Configuration conf, 1375 String propertyName) { 1376 Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName); 1377 StartupOption startOpt = NameNode.getStartupOption(conf); 1378 if(startOpt == StartupOption.IMPORT) { 1379 // In case of IMPORT this will get rid of default directories 1380 // but will retain directories specified in hdfs-site.xml 1381 // When importing image from a checkpoint, the name-node can 1382 // start with empty set of storage directories. 1383 Configuration cE = new HdfsConfiguration(false); 1384 cE.addResource("core-default.xml"); 1385 cE.addResource("core-site.xml"); 1386 cE.addResource("hdfs-default.xml"); 1387 Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName); 1388 dirNames.removeAll(dirNames2); 1389 if(dirNames.isEmpty()) 1390 LOG.warn("!!! WARNING !!!" + 1391 "\n\tThe NameNode currently runs without persistent storage." + 1392 "\n\tAny changes to the file system meta-data may be lost." + 1393 "\n\tRecommended actions:" + 1394 "\n\t\t- shutdown and restart NameNode with configured \"" 1395 + propertyName + "\" in hdfs-site.xml;" + 1396 "\n\t\t- use Backup Node as a persistent and up-to-date storage " + 1397 "of the file system meta-data."); 1398 } else if (dirNames.isEmpty()) { 1399 dirNames = Collections.singletonList( 1400 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT); 1401 } 1402 return Util.stringCollectionAsURIs(dirNames); 1403 } 1404 1405 /** 1406 * Return an ordered list of edits directories to write to. 1407 * The list is ordered such that all shared edits directories 1408 * are ordered before non-shared directories, and any duplicates 1409 * are removed. 
The order they are specified in the configuration 1410 * is retained. 1411 * @return Collection of shared edits directories. 1412 * @throws IOException if multiple shared edits directories are configured 1413 */ 1414 public static List<URI> getNamespaceEditsDirs(Configuration conf) 1415 throws IOException { 1416 return getNamespaceEditsDirs(conf, true); 1417 } 1418 1419 public static List<URI> getNamespaceEditsDirs(Configuration conf, 1420 boolean includeShared) 1421 throws IOException { 1422 // Use a LinkedHashSet so that order is maintained while we de-dup 1423 // the entries. 1424 LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>(); 1425 1426 if (includeShared) { 1427 List<URI> sharedDirs = getSharedEditsDirs(conf); 1428 1429 // Fail until multiple shared edits directories are supported (HDFS-2782) 1430 if (sharedDirs.size() > 1) { 1431 throw new IOException( 1432 "Multiple shared edits directories are not yet supported"); 1433 } 1434 1435 // First add the shared edits dirs. It's critical that the shared dirs 1436 // are added first, since JournalSet syncs them in the order they are listed, 1437 // and we need to make sure all edits are in place in the shared storage 1438 // before they are replicated locally. See HDFS-2874. 1439 for (URI dir : sharedDirs) { 1440 if (!editsDirs.add(dir)) { 1441 LOG.warn("Edits URI " + dir + " listed multiple times in " + 1442 DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates."); 1443 } 1444 } 1445 } 1446 // Now add the non-shared dirs. 1447 for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) { 1448 if (!editsDirs.add(dir)) { 1449 LOG.warn("Edits URI " + dir + " listed multiple times in " + 1450 DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " + 1451 DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates."); 1452 } 1453 } 1454 1455 if (editsDirs.isEmpty()) { 1456 // If this is the case, no edit dirs have been explicitly configured. 1457 // Image dirs are to be used for edits too. 
1458 return Lists.newArrayList(getNamespaceDirs(conf)); 1459 } else { 1460 return Lists.newArrayList(editsDirs); 1461 } 1462 } 1463 1464 /** 1465 * Returns edit directories that are shared between primary and secondary. 1466 * @param conf configuration 1467 * @return collection of edit directories from {@code conf} 1468 */ 1469 public static List<URI> getSharedEditsDirs(Configuration conf) { 1470 // don't use getStorageDirs here, because we want an empty default 1471 // rather than the dir in /tmp 1472 Collection<String> dirNames = conf.getTrimmedStringCollection( 1473 DFS_NAMENODE_SHARED_EDITS_DIR_KEY); 1474 return Util.stringCollectionAsURIs(dirNames); 1475 } 1476 1477 @Override 1478 public void readLock() { 1479 this.fsLock.readLock(); 1480 } 1481 @Override 1482 public void readUnlock() { 1483 this.fsLock.readUnlock(); 1484 } 1485 public void readUnlock(String opName) { 1486 this.fsLock.readUnlock(opName); 1487 } 1488 @Override 1489 public void writeLock() { 1490 this.fsLock.writeLock(); 1491 } 1492 @Override 1493 public void writeLockInterruptibly() throws InterruptedException { 1494 this.fsLock.writeLockInterruptibly(); 1495 } 1496 @Override 1497 public void writeUnlock() { 1498 this.fsLock.writeUnlock(); 1499 } 1500 public void writeUnlock(String opName) { 1501 this.fsLock.writeUnlock(opName); 1502 } 1503 @Override 1504 public boolean hasWriteLock() { 1505 return this.fsLock.isWriteLockedByCurrentThread(); 1506 } 1507 @Override 1508 public boolean hasReadLock() { 1509 return this.fsLock.getReadHoldCount() > 0 || hasWriteLock(); 1510 } 1511 1512 public int getReadHoldCount() { 1513 return this.fsLock.getReadHoldCount(); 1514 } 1515 1516 public int getWriteHoldCount() { 1517 return this.fsLock.getWriteHoldCount(); 1518 } 1519 1520 /** Lock the checkpoint lock */ 1521 public void cpLock() { 1522 this.cpLock.lock(); 1523 } 1524 1525 /** Lock the checkpoint lock interrupibly */ 1526 public void cpLockInterruptibly() throws InterruptedException { 1527 
this.cpLock.lockInterruptibly(); 1528 } 1529 1530 /** Unlock the checkpoint lock */ 1531 public void cpUnlock() { 1532 this.cpLock.unlock(); 1533 } 1534 1535 1536 NamespaceInfo getNamespaceInfo() { 1537 readLock(); 1538 try { 1539 return unprotectedGetNamespaceInfo(); 1540 } finally { 1541 readUnlock("getNamespaceInfo"); 1542 } 1543 } 1544 1545 /** 1546 * Version of @see #getNamespaceInfo() that is not protected by a lock. 1547 */ 1548 NamespaceInfo unprotectedGetNamespaceInfo() { 1549 return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(), 1550 getClusterId(), getBlockPoolId(), 1551 getFSImage().getStorage().getCTime()); 1552 } 1553 1554 /** 1555 * Close down this file system manager. 1556 * Causes heartbeat and lease daemons to stop; waits briefly for 1557 * them to finish, but a short timeout returns control back to caller. 1558 */ 1559 void close() { 1560 fsRunning = false; 1561 try { 1562 stopCommonServices(); 1563 if (smmthread != null) smmthread.interrupt(); 1564 } finally { 1565 // using finally to ensure we also wait for lease daemon 1566 try { 1567 stopActiveServices(); 1568 stopStandbyServices(); 1569 } catch (IOException ie) { 1570 } finally { 1571 IOUtils.cleanup(LOG, dir); 1572 IOUtils.cleanup(LOG, fsImage); 1573 } 1574 } 1575 } 1576 1577 @Override 1578 public boolean isRunning() { 1579 return fsRunning; 1580 } 1581 1582 @Override 1583 public boolean isInStandbyState() { 1584 if (haContext == null || haContext.getState() == null) { 1585 // We're still starting up. In this case, if HA is 1586 // on for the cluster, we always start in standby. Otherwise 1587 // start in active. 
1588 return haEnabled; 1589 } 1590 1591 return HAServiceState.STANDBY == haContext.getState().getServiceState(); 1592 } 1593 1594 /** 1595 * Dump all metadata into specified file 1596 */ 1597 void metaSave(String filename) throws IOException { 1598 checkSuperuserPrivilege(); 1599 checkOperation(OperationCategory.UNCHECKED); 1600 writeLock(); 1601 try { 1602 checkOperation(OperationCategory.UNCHECKED); 1603 File file = new File(System.getProperty("hadoop.log.dir"), filename); 1604 PrintWriter out = new PrintWriter(new BufferedWriter( 1605 new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8))); 1606 metaSave(out); 1607 out.flush(); 1608 out.close(); 1609 } finally { 1610 writeUnlock("metaSave"); 1611 } 1612 } 1613 1614 private void metaSave(PrintWriter out) { 1615 assert hasWriteLock(); 1616 long totalInodes = this.dir.totalInodes(); 1617 long totalBlocks = this.getBlocksTotal(); 1618 out.println(totalInodes + " files and directories, " + totalBlocks 1619 + " blocks = " + (totalInodes + totalBlocks) + " total"); 1620 1621 blockManager.metaSave(out); 1622 } 1623 1624 private String metaSaveAsString() { 1625 StringWriter sw = new StringWriter(); 1626 PrintWriter pw = new PrintWriter(sw); 1627 metaSave(pw); 1628 pw.flush(); 1629 return sw.toString(); 1630 } 1631 1632 FsServerDefaults getServerDefaults() throws StandbyException { 1633 checkOperation(OperationCategory.READ); 1634 return serverDefaults; 1635 } 1636 1637 long getAccessTimePrecision() { 1638 return accessTimePrecision; 1639 } 1640 1641 private boolean isAccessTimeSupported() { 1642 return accessTimePrecision > 0; 1643 } 1644 1645 ///////////////////////////////////////////////////////// 1646 // 1647 // These methods are called by HadoopFS clients 1648 // 1649 ///////////////////////////////////////////////////////// 1650 /** 1651 * Set permissions for an existing file. 
1652 * @throws IOException 1653 */ 1654 void setPermission(String src, FsPermission permission) throws IOException { 1655 final String operationName = "setPermission"; 1656 HdfsFileStatus auditStat; 1657 checkOperation(OperationCategory.WRITE); 1658 writeLock(); 1659 try { 1660 checkOperation(OperationCategory.WRITE); 1661 checkNameNodeSafeMode("Cannot set permission for " + src); 1662 auditStat = FSDirAttrOp.setPermission(dir, src, permission); 1663 } catch (AccessControlException e) { 1664 logAuditEvent(false, operationName, src); 1665 throw e; 1666 } finally { 1667 writeUnlock(operationName); 1668 } 1669 getEditLog().logSync(); 1670 logAuditEvent(true, operationName, src, null, auditStat); 1671 } 1672 1673 /** 1674 * Set owner for an existing file. 1675 * @throws IOException 1676 */ 1677 void setOwner(String src, String username, String group) 1678 throws IOException { 1679 final String operationName = "setOwner"; 1680 HdfsFileStatus auditStat; 1681 checkOperation(OperationCategory.WRITE); 1682 writeLock(); 1683 try { 1684 checkOperation(OperationCategory.WRITE); 1685 checkNameNodeSafeMode("Cannot set owner for " + src); 1686 auditStat = FSDirAttrOp.setOwner(dir, src, username, group); 1687 } catch (AccessControlException e) { 1688 logAuditEvent(false, operationName, src); 1689 throw e; 1690 } finally { 1691 writeUnlock(operationName); 1692 } 1693 getEditLog().logSync(); 1694 logAuditEvent(true, operationName, src, null, auditStat); 1695 } 1696 1697 static class GetBlockLocationsResult { 1698 final boolean updateAccessTime; 1699 final LocatedBlocks blocks; 1700 boolean updateAccessTime() { 1701 return updateAccessTime; 1702 } 1703 private GetBlockLocationsResult( 1704 boolean updateAccessTime, LocatedBlocks blocks) { 1705 this.updateAccessTime = updateAccessTime; 1706 this.blocks = blocks; 1707 } 1708 } 1709 1710 /** 1711 * Get block locations within the specified range. 
* @see ClientProtocol#getBlockLocations(String, long, long)
   *
   * @param clientMachine passed to the datanode manager when sorting the
   *                      returned block locations
   * @param srcArg path of the file as supplied by the caller
   * @param offset start offset of the requested range
   * @param length length of the requested range
   * @return the located blocks held by the lookup result; the code tolerates
   *         a null value, in which case no sorting is attempted
   * @throws IOException if the lookup fails; an AccessControlException is
   *         audit-logged as a denied "open" before being rethrown
   */
  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
      long offset, long length) throws IOException {
    final String operationName = "open";
    checkOperation(OperationCategory.READ);
    GetBlockLocationsResult res = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Phase 1: look up the blocks under the read lock only.
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      res = getBlockLocations(pc, srcArg, offset, length, true, true);
    } catch (AccessControlException e) {
      logAuditEvent(false, operationName, srcArg);
      throw e;
    } finally {
      readUnlock(operationName);
    }

    logAuditEvent(true, operationName, srcArg);

    // Phase 2: only when the lookup asked for it, take the write lock and
    // update the access time. Any failure here is logged and ignored so the
    // read itself still succeeds.
    if (res.updateAccessTime()) {
      String src = srcArg;
      writeLock();
      final long now = now();
      try {
        checkOperation(OperationCategory.WRITE);
        /**
         * Resolve the path again and update the atime only when the file
         * exists.
         *
         * XXX: Races can still occur even after resolving the path again.
         * For example:
         *
         * <ul>
         *   <li>Get the block location for "/a/b"</li>
         *   <li>Rename "/a/b" to "/c/b"</li>
         *   <li>The second resolution still points to "/a/b", which is
         *   wrong.</li>
         * </ul>
         *
         * The behavior is incorrect but consistent with the one before
         * HDFS-7463. A better fix is to change the edit log of SetTime to
         * use inode id instead of a path.
         */
        final INodesInPath iip = dir.resolvePath(pc, src);
        src = iip.getPath();
        INode inode = iip.getLastINode();
        // Re-evaluate the precision window against the freshly resolved inode.
        boolean updateAccessTime = inode != null &&
            now > inode.getAccessTime() + getAccessTimePrecision();
        if (!isInSafeMode() && updateAccessTime) {
          boolean changed = FSDirAttrOp.setTimes(dir,
              inode, -1, now, false, iip.getLatestSnapshotId());
          if (changed) {
            // -1 is passed as the modification time argument here.
            getEditLog().logTimes(src, -1, now);
          }
        }
      } catch (Throwable e) {
        LOG.warn("Failed to update the access time of " + src, e);
      } finally {
        writeUnlock(operationName);
      }
    }

    // Phase 3: sort locations relative to the client, outside any lock.
    LocatedBlocks blocks = res.blocks;
    if (blocks != null) {
      blockManager.getDatanodeManager().sortLocatedBlocks(
          clientMachine, blocks.getLocatedBlocks());

      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
      if (lastBlock != null) {
        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
        blockManager.getDatanodeManager().sortLocatedBlocks(
            clientMachine, lastBlockList);
      }
    }
    return blocks;
  }

  /**
   * Get block locations within the specified range.
   * Validates that offset and length are non-negative, delegates to
   * {@code getBlockLocationsInt}, and, when {@code checkSafeMode} is set and
   * the namesystem is in safe mode, rejects results containing a block with
   * no locations.
   *
   * @see ClientProtocol#getBlockLocations(String, long, long)
   * @param pc permission checker used for path resolution and access checks
   * @param src path of the file
   * @param offset start offset of the requested range; must not be negative
   * @param length length of the requested range; must not be negative
   * @param needBlockToken forwarded to the block manager
   * @param checkSafeMode whether to apply the safe-mode zero-location check
   * @return the located blocks plus the access-time-update flag
   * @throws IOException on lookup failure; a SafeModeException (wrapped in a
   *         RetriableException when HA is enabled and this node is ACTIVE)
   *         if a block has no locations while in safe mode
   */
  GetBlockLocationsResult getBlockLocations(
      FSPermissionChecker pc, String src, long offset, long length,
      boolean needBlockToken, boolean checkSafeMode) throws IOException {
    if (offset < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative offset is not supported. File: " + src);
    }
    if (length < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative length is not supported. File: " + src);
    }
    final GetBlockLocationsResult ret = getBlockLocationsInt(
        pc, src, offset, length, needBlockToken);

    if (checkSafeMode && isInSafeMode()) {
      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
        // if safemode & no block locations yet then throw safemodeException
        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
          SafeModeException se = new SafeModeException(
              "Zero blocklocations for " + src, safeMode);
          if (haEnabled && haContext != null &&
              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
            throw new RetriableException(se);
          } else {
            throw se;
          }
        }
      }
    }
    return ret;
  }

  /**
   * Resolve the path, check read access (when permissions are enabled),
   * build the located blocks for the requested range and decide whether the
   * caller should update the access time afterwards.
   * No lock is acquired in this method itself.
   *
   * @param srcArg original path; also used to detect a reserved raw name, in
   *               which case no file encryption info is attached
   */
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    final INodesInPath iip = dir.resolvePath(pc, src);
    src = iip.getPath();
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    // Snapshot paths use the snapshot's file size; live paths exclude the
    // last under-construction block.
    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // Only request an atime update when atime is supported, we are not in
    // safe mode, the path is not a snapshot, and the precision window has
    // elapsed.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }

  /**
   * Moves all the blocks from {@code srcs} and appends them to {@code target}
   * To avoid rollbacks we will verify validity of ALL of the args
   * before we start actual move.
   * 
   * This does not support ".inodes" relative path
   * The edit log is synced only on success; the audit event is emitted in
   * either case with the success flag.
   *
   * @param target target to concat into
   * @param srcs file that will be concatenated
   * @param logRetryCache forwarded to {@code FSDirConcatOp.concat}
   * @throws IOException on error
   */
  void concat(String target, String [] srcs, boolean logRetryCache)
      throws IOException {
    waitForLoadingFSImage();
    final String operationName = "concat";
    HdfsFileStatus stat = null;
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot concat " + target);
      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
      success = true;
    } finally {
      writeUnlock(operationName);
      if (success) {
        getEditLog().logSync();
      }
      logAuditEvent(success, operationName, Arrays.toString(srcs),
          target, stat);
    }
  }

  /**
   * stores the modification and access time for this inode. 
   * The access time is precise up to an hour. The transaction, if needed, is
   * written to the edits log but is not flushed.
1909 */ 1910 void setTimes(String src, long mtime, long atime) throws IOException { 1911 final String operationName = "setTimes"; 1912 HdfsFileStatus auditStat; 1913 checkOperation(OperationCategory.WRITE); 1914 writeLock(); 1915 try { 1916 checkOperation(OperationCategory.WRITE); 1917 checkNameNodeSafeMode("Cannot set times " + src); 1918 auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime); 1919 } catch (AccessControlException e) { 1920 logAuditEvent(false, operationName, src); 1921 throw e; 1922 } finally { 1923 writeUnlock(operationName); 1924 } 1925 getEditLog().logSync(); 1926 logAuditEvent(true, operationName, src, null, auditStat); 1927 } 1928 1929 /** 1930 * Create a symbolic link. 1931 */ 1932 @SuppressWarnings("deprecation") 1933 void createSymlink(String target, String link, 1934 PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 1935 throws IOException { 1936 final String operationName = "createSymlink"; 1937 if (!FileSystem.areSymlinksEnabled()) { 1938 throw new UnsupportedOperationException("Symlinks not supported"); 1939 } 1940 HdfsFileStatus auditStat = null; 1941 checkOperation(OperationCategory.WRITE); 1942 writeLock(); 1943 try { 1944 checkOperation(OperationCategory.WRITE); 1945 checkNameNodeSafeMode("Cannot create symlink " + link); 1946 auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms, 1947 createParent, logRetryCache); 1948 } catch (AccessControlException e) { 1949 logAuditEvent(false, operationName, link, target, null); 1950 throw e; 1951 } finally { 1952 writeUnlock(operationName); 1953 } 1954 getEditLog().logSync(); 1955 logAuditEvent(true, operationName, link, target, auditStat); 1956 } 1957 1958 /** 1959 * Set replication for an existing file. 1960 * 1961 * The NameNode sets new replication and schedules either replication of 1962 * under-replicated data blocks or removal of the excessive block copies 1963 * if the blocks are over-replicated. 
1964 * 1965 * @see ClientProtocol#setReplication(String, short) 1966 * @param src file name 1967 * @param replication new replication 1968 * @return true if successful; 1969 * false if file does not exist or is a directory 1970 */ 1971 boolean setReplication(final String src, final short replication) 1972 throws IOException { 1973 final String operationName = "setReplication"; 1974 boolean success = false; 1975 waitForLoadingFSImage(); 1976 checkOperation(OperationCategory.WRITE); 1977 writeLock(); 1978 try { 1979 checkOperation(OperationCategory.WRITE); 1980 checkNameNodeSafeMode("Cannot set replication for " + src); 1981 success = FSDirAttrOp.setReplication(dir, blockManager, src, replication); 1982 } catch (AccessControlException e) { 1983 logAuditEvent(false, operationName, src); 1984 throw e; 1985 } finally { 1986 writeUnlock(operationName); 1987 } 1988 if (success) { 1989 getEditLog().logSync(); 1990 logAuditEvent(true, operationName, src); 1991 } 1992 return success; 1993 } 1994 1995 /** 1996 * Truncate file to a lower length. 1997 * Truncate cannot be reverted / recovered from as it causes data loss. 1998 * Truncation at block boundary is atomic, otherwise it requires 1999 * block recovery to truncate the last block of the file. 2000 * 2001 * @return true if client does not need to wait for block recovery, 2002 * false if client needs to wait for block recovery. 
2003 */ 2004 boolean truncate(String src, long newLength, 2005 String clientName, String clientMachine, 2006 long mtime) 2007 throws IOException, UnresolvedLinkException { 2008 boolean ret; 2009 try { 2010 ret = truncateInt(src, newLength, clientName, clientMachine, mtime); 2011 } catch (AccessControlException e) { 2012 logAuditEvent(false, "truncate", src); 2013 throw e; 2014 } 2015 return ret; 2016 } 2017 2018 boolean truncateInt(String srcArg, long newLength, 2019 String clientName, String clientMachine, 2020 long mtime) 2021 throws IOException, UnresolvedLinkException { 2022 final String operationName = "truncate"; 2023 String src = srcArg; 2024 NameNode.stateChangeLog.debug( 2025 "DIR* NameSystem.truncate: src={} newLength={}", src, newLength); 2026 if (newLength < 0) { 2027 throw new HadoopIllegalArgumentException( 2028 "Cannot truncate to a negative file size: " + newLength + "."); 2029 } 2030 HdfsFileStatus stat = null; 2031 FSPermissionChecker pc = getPermissionChecker(); 2032 checkOperation(OperationCategory.WRITE); 2033 boolean res; 2034 writeLock(); 2035 BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo(); 2036 try { 2037 checkOperation(OperationCategory.WRITE); 2038 checkNameNodeSafeMode("Cannot truncate for " + src); 2039 INodesInPath iip = dir.resolvePath(pc, src); 2040 src = iip.getPath(); 2041 res = truncateInternal(src, newLength, clientName, 2042 clientMachine, mtime, pc, toRemoveBlocks); 2043 stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false)); 2044 } finally { 2045 writeUnlock(operationName); 2046 } 2047 getEditLog().logSync(); 2048 if (!toRemoveBlocks.getToDeleteList().isEmpty()) { 2049 removeBlocks(toRemoveBlocks); 2050 toRemoveBlocks.clear(); 2051 } 2052 logAuditEvent(true, operationName, src, null, stat); 2053 return res; 2054 } 2055 2056 /** 2057 * Truncate a file to a given size 2058 * Update the count at each ancestor directory with quota 2059 */ 2060 boolean truncateInternal(String src, long newLength, 2061 
String clientName, String clientMachine, 2062 long mtime, FSPermissionChecker pc, 2063 BlocksMapUpdateInfo toRemoveBlocks) 2064 throws IOException, UnresolvedLinkException { 2065 assert hasWriteLock(); 2066 INodesInPath iip = dir.getINodesInPath4Write(src, true); 2067 if (isPermissionEnabled) { 2068 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2069 } 2070 INodeFile file = INodeFile.valueOf(iip.getLastINode(), src); 2071 final BlockStoragePolicy lpPolicy = 2072 blockManager.getStoragePolicy("LAZY_PERSIST"); 2073 2074 if (lpPolicy != null && 2075 lpPolicy.getId() == file.getStoragePolicyID()) { 2076 throw new UnsupportedOperationException( 2077 "Cannot truncate lazy persist file " + src); 2078 } 2079 2080 // Check if the file is already being truncated with the same length 2081 final BlockInfoContiguous last = file.getLastBlock(); 2082 if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) { 2083 final Block truncateBlock 2084 = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock(); 2085 if (truncateBlock != null) { 2086 final long truncateLength = file.computeFileSize(false, false) 2087 + truncateBlock.getNumBytes(); 2088 if (newLength == truncateLength) { 2089 return false; 2090 } 2091 } 2092 } 2093 2094 // Opening an existing file for truncate. May need lease recovery. 2095 recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE, 2096 iip, src, clientName, clientMachine, false); 2097 // Truncate length check. 2098 long oldLength = file.computeFileSize(); 2099 if(oldLength == newLength) { 2100 return true; 2101 } 2102 if(oldLength < newLength) { 2103 throw new HadoopIllegalArgumentException( 2104 "Cannot truncate to a larger file size. Current size: " + oldLength + 2105 ", truncate size: " + newLength + "."); 2106 } 2107 // Perform INodeFile truncation. 
2108 final QuotaCounts delta = new QuotaCounts.Builder().build(); 2109 boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks, 2110 mtime, delta); 2111 Block truncateBlock = null; 2112 if(!onBlockBoundary) { 2113 // Open file for write, but don't log into edits 2114 long lastBlockDelta = file.computeFileSize() - newLength; 2115 assert lastBlockDelta > 0 : "delta is 0 only if on block bounday"; 2116 truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine, 2117 lastBlockDelta, null); 2118 } 2119 2120 // update the quota: use the preferred block size for UC block 2121 dir.writeLock(); 2122 try { 2123 dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta); 2124 } finally { 2125 dir.writeUnlock(); 2126 } 2127 2128 getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime, 2129 truncateBlock); 2130 return onBlockBoundary; 2131 } 2132 2133 /** 2134 * Convert current INode to UnderConstruction. 2135 * Recreate lease. 2136 * Create new block for the truncated copy. 2137 * Schedule truncation of the replicas. 2138 * 2139 * @return the returned block will be written to editLog and passed back into 2140 * this method upon loading. 2141 */ 2142 Block prepareFileForTruncate(INodesInPath iip, 2143 String leaseHolder, 2144 String clientMachine, 2145 long lastBlockDelta, 2146 Block newBlock) 2147 throws IOException { 2148 INodeFile file = iip.getLastINode().asFile(); 2149 String src = iip.getPath(); 2150 file.recordModification(iip.getLatestSnapshotId()); 2151 file.toUnderConstruction(leaseHolder, clientMachine); 2152 assert file.isUnderConstruction() : "inode should be under construction."; 2153 leaseManager.addLease( 2154 file.getFileUnderConstructionFeature().getClientName(), src); 2155 boolean shouldRecoverNow = (newBlock == null); 2156 BlockInfoContiguous oldBlock = file.getLastBlock(); 2157 boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock); 2158 if(newBlock == null) { 2159 newBlock = (shouldCopyOnTruncate) ? 
createNewBlock() : 2160 new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(), 2161 nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock))); 2162 } 2163 2164 BlockInfoContiguousUnderConstruction truncatedBlockUC; 2165 if(shouldCopyOnTruncate) { 2166 // Add new truncateBlock into blocksMap and 2167 // use oldBlock as a source for copy-on-truncate recovery 2168 truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock, 2169 file.getBlockReplication()); 2170 truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta); 2171 truncatedBlockUC.setTruncateBlock(oldBlock); 2172 file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock)); 2173 getBlockManager().addBlockCollection(truncatedBlockUC, file); 2174 2175 NameNode.stateChangeLog.debug( 2176 "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" + 2177 " size {} new block {} old block {}", truncatedBlockUC.getNumBytes(), 2178 newBlock, truncatedBlockUC.getTruncateBlock()); 2179 } else { 2180 // Use new generation stamp for in-place truncate recovery 2181 blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta); 2182 oldBlock = file.getLastBlock(); 2183 assert !oldBlock.isComplete() : "oldBlock should be under construction"; 2184 truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock; 2185 truncatedBlockUC.setTruncateBlock(new Block(oldBlock)); 2186 truncatedBlockUC.getTruncateBlock().setNumBytes( 2187 oldBlock.getNumBytes() - lastBlockDelta); 2188 truncatedBlockUC.getTruncateBlock().setGenerationStamp( 2189 newBlock.getGenerationStamp()); 2190 2191 NameNode.stateChangeLog.debug( 2192 "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " + 2193 "truncate to new size {}", 2194 truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC); 2195 } 2196 if (shouldRecoverNow) { 2197 truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp()); 2198 } 2199 2200 return newBlock; 2201 } 2202 2203 /** 2204 * Defines if a 
replica needs to be copied on truncate or 2205 * can be truncated in place. 2206 */ 2207 boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) { 2208 if(!isUpgradeFinalized()) { 2209 return true; 2210 } 2211 if (isRollingUpgrade()) { 2212 return true; 2213 } 2214 return file.isBlockInLatestSnapshot(blk); 2215 } 2216 2217 /** 2218 * Set the storage policy for a file or a directory. 2219 * 2220 * @param src file/directory path 2221 * @param policyName storage policy name 2222 */ 2223 void setStoragePolicy(String src, String policyName) throws IOException { 2224 HdfsFileStatus auditStat; 2225 waitForLoadingFSImage(); 2226 checkOperation(OperationCategory.WRITE); 2227 final String operationName = "setStoragePolicy"; 2228 writeLock(); 2229 try { 2230 checkOperation(OperationCategory.WRITE); 2231 checkNameNodeSafeMode("Cannot set storage policy for " + src); 2232 auditStat = FSDirAttrOp.setStoragePolicy( 2233 dir, blockManager, src, policyName); 2234 } catch (AccessControlException e) { 2235 logAuditEvent(false, operationName, src); 2236 throw e; 2237 } finally { 2238 writeUnlock(operationName); 2239 } 2240 getEditLog().logSync(); 2241 logAuditEvent(true, operationName, src, null, auditStat); 2242 } 2243 2244 /** 2245 * @return All the existing block storage policies 2246 */ 2247 BlockStoragePolicy[] getStoragePolicies() throws IOException { 2248 checkOperation(OperationCategory.READ); 2249 waitForLoadingFSImage(); 2250 readLock(); 2251 try { 2252 checkOperation(OperationCategory.READ); 2253 return FSDirAttrOp.getStoragePolicies(blockManager); 2254 } finally { 2255 readUnlock("getStoragePolicies"); 2256 } 2257 } 2258 2259 long getPreferredBlockSize(String src) throws IOException { 2260 checkOperation(OperationCategory.READ); 2261 readLock(); 2262 try { 2263 checkOperation(OperationCategory.READ); 2264 return FSDirAttrOp.getPreferredBlockSize(dir, src); 2265 } finally { 2266 readUnlock("getPreferredBlockSize"); 2267 } 2268 } 2269 2270 /** 2271 * If the 
file is within an encryption zone, select the appropriate 2272 * CryptoProtocolVersion from the list provided by the client. Since the 2273 * client may be newer, we need to handle unknown versions. 2274 * 2275 * @param zone EncryptionZone of the file 2276 * @param supportedVersions List of supported protocol versions 2277 * @return chosen protocol version 2278 * @throws IOException 2279 */ 2280 private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone, 2281 CryptoProtocolVersion[] supportedVersions) 2282 throws UnknownCryptoProtocolVersionException, UnresolvedLinkException, 2283 SnapshotAccessControlException { 2284 Preconditions.checkNotNull(zone); 2285 Preconditions.checkNotNull(supportedVersions); 2286 // Right now, we only support a single protocol version, 2287 // so simply look for it in the list of provided options 2288 final CryptoProtocolVersion required = zone.getVersion(); 2289 2290 for (CryptoProtocolVersion c : supportedVersions) { 2291 if (c.equals(CryptoProtocolVersion.UNKNOWN)) { 2292 if (LOG.isDebugEnabled()) { 2293 LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " + 2294 "client: " + c.getUnknownValue()); 2295 } 2296 continue; 2297 } 2298 if (c.equals(required)) { 2299 return c; 2300 } 2301 } 2302 throw new UnknownCryptoProtocolVersionException( 2303 "No crypto protocol versions provided by the client are supported." 2304 + " Client provided: " + Arrays.toString(supportedVersions) 2305 + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion 2306 .values())); 2307 } 2308 2309 /** 2310 * Invoke KeyProvider APIs to generate an encrypted data encryption key for an 2311 * encryption zone. Should not be called with any locks held. 
2312 * 2313 * @param ezKeyName key name of an encryption zone 2314 * @return New EDEK, or null if ezKeyName is null 2315 * @throws IOException 2316 */ 2317 private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String 2318 ezKeyName) throws IOException { 2319 if (ezKeyName == null) { 2320 return null; 2321 } 2322 EncryptedKeyVersion edek = null; 2323 try { 2324 edek = provider.generateEncryptedKey(ezKeyName); 2325 } catch (GeneralSecurityException e) { 2326 throw new IOException(e); 2327 } 2328 Preconditions.checkNotNull(edek); 2329 return edek; 2330 } 2331 2332 /** 2333 * Create a new file entry in the namespace. 2334 * 2335 * For description of parameters and exceptions thrown see 2336 * {@link ClientProtocol#create}, except it returns valid file status upon 2337 * success 2338 */ 2339 HdfsFileStatus startFile(String src, PermissionStatus permissions, 2340 String holder, String clientMachine, EnumSet<CreateFlag> flag, 2341 boolean createParent, short replication, long blockSize, 2342 CryptoProtocolVersion[] supportedVersions, boolean logRetryCache) 2343 throws AccessControlException, SafeModeException, 2344 FileAlreadyExistsException, UnresolvedLinkException, 2345 FileNotFoundException, ParentNotDirectoryException, IOException { 2346 2347 HdfsFileStatus status = null; 2348 try { 2349 status = startFileInt(src, permissions, holder, clientMachine, flag, 2350 createParent, replication, blockSize, supportedVersions, 2351 logRetryCache); 2352 } catch (AccessControlException e) { 2353 logAuditEvent(false, "create", src); 2354 throw e; 2355 } 2356 return status; 2357 } 2358 2359 private HdfsFileStatus startFileInt(final String srcArg, 2360 PermissionStatus permissions, String holder, String clientMachine, 2361 EnumSet<CreateFlag> flag, boolean createParent, short replication, 2362 long blockSize, CryptoProtocolVersion[] supportedVersions, 2363 boolean logRetryCache) 2364 throws AccessControlException, SafeModeException, 2365 FileAlreadyExistsException, 
UnresolvedLinkException, 2366 FileNotFoundException, ParentNotDirectoryException, IOException { 2367 String src = srcArg; 2368 final String operationName = "create"; 2369 if (NameNode.stateChangeLog.isDebugEnabled()) { 2370 StringBuilder builder = new StringBuilder(); 2371 builder.append("DIR* NameSystem.startFile: src=" + src 2372 + ", holder=" + holder 2373 + ", clientMachine=" + clientMachine 2374 + ", createParent=" + createParent 2375 + ", replication=" + replication 2376 + ", createFlag=" + flag.toString() 2377 + ", blockSize=" + blockSize); 2378 builder.append(", supportedVersions="); 2379 if (supportedVersions != null) { 2380 builder.append(Arrays.toString(supportedVersions)); 2381 } else { 2382 builder.append("null"); 2383 } 2384 NameNode.stateChangeLog.debug(builder.toString()); 2385 } 2386 if (!DFSUtil.isValidName(src)) { 2387 throw new InvalidPathException(src); 2388 } 2389 blockManager.verifyReplication(src, replication, clientMachine); 2390 2391 boolean skipSync = false; 2392 HdfsFileStatus stat = null; 2393 FSPermissionChecker pc = getPermissionChecker(); 2394 if (blockSize < minBlockSize) { 2395 throw new IOException("Specified block size is less than configured" + 2396 " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY 2397 + "): " + blockSize + " < " + minBlockSize); 2398 } 2399 boolean create = flag.contains(CreateFlag.CREATE); 2400 boolean overwrite = flag.contains(CreateFlag.OVERWRITE); 2401 boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST); 2402 2403 waitForLoadingFSImage(); 2404 2405 /** 2406 * If the file is in an encryption zone, we optimistically create an 2407 * EDEK for the file by calling out to the configured KeyProvider. 2408 * Since this typically involves doing an RPC, we take the readLock 2409 * initially, then drop it to do the RPC. 
2410 * 2411 * Since the path can flip-flop between being in an encryption zone and not 2412 * in the meantime, we need to recheck the preconditions when we retake the 2413 * lock to do the create. If the preconditions are not met, we throw a 2414 * special RetryStartFileException to ask the DFSClient to try the create 2415 * again later. 2416 */ 2417 CryptoProtocolVersion protocolVersion = null; 2418 CipherSuite suite = null; 2419 String ezKeyName = null; 2420 EncryptedKeyVersion edek = null; 2421 2422 if (provider != null) { 2423 readLock(); 2424 try { 2425 INodesInPath iip = dir.resolvePathForWrite(pc, src); 2426 src = iip.getPath(); 2427 // Nothing to do if the path is not within an EZ 2428 final EncryptionZone zone = dir.getEZForPath(iip); 2429 if (zone != null) { 2430 protocolVersion = chooseProtocolVersion(zone, supportedVersions); 2431 suite = zone.getSuite(); 2432 ezKeyName = zone.getKeyName(); 2433 2434 Preconditions.checkNotNull(protocolVersion); 2435 Preconditions.checkNotNull(suite); 2436 Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN), 2437 "Chose an UNKNOWN CipherSuite!"); 2438 Preconditions.checkNotNull(ezKeyName); 2439 } 2440 } finally { 2441 readUnlock(operationName); 2442 } 2443 2444 Preconditions.checkState( 2445 (suite == null && ezKeyName == null) || 2446 (suite != null && ezKeyName != null), 2447 "Both suite and ezKeyName should both be null or not null"); 2448 2449 // Generate EDEK if necessary while not holding the lock 2450 edek = generateEncryptedDataEncryptionKey(ezKeyName); 2451 EncryptionFaultInjector.getInstance().startFileAfterGenerateKey(); 2452 } 2453 2454 // Proceed with the create, using the computed cipher suite and 2455 // generated EDEK 2456 BlocksMapUpdateInfo toRemoveBlocks = null; 2457 writeLock(); 2458 try { 2459 checkOperation(OperationCategory.WRITE); 2460 checkNameNodeSafeMode("Cannot create file" + src); 2461 dir.writeLock(); 2462 try { 2463 final INodesInPath iip = dir.resolvePathForWrite(pc, src); 2464 
src = iip.getPath(); 2465 toRemoveBlocks = startFileInternal( 2466 pc, iip, permissions, holder, 2467 clientMachine, create, overwrite, 2468 createParent, replication, blockSize, 2469 isLazyPersist, suite, protocolVersion, edek, 2470 logRetryCache); 2471 stat = FSDirStatAndListingOp.getFileInfo( 2472 dir, src, false, FSDirectory.isReservedRawName(srcArg)); 2473 } finally { 2474 dir.writeUnlock(); 2475 } 2476 } catch (StandbyException se) { 2477 skipSync = true; 2478 throw se; 2479 } finally { 2480 writeUnlock(operationName); 2481 // There might be transactions logged while trying to recover the lease. 2482 // They need to be sync'ed even when an exception was thrown. 2483 if (!skipSync) { 2484 getEditLog().logSync(); 2485 if (toRemoveBlocks != null) { 2486 removeBlocks(toRemoveBlocks); 2487 toRemoveBlocks.clear(); 2488 } 2489 } 2490 } 2491 2492 logAuditEvent(true, operationName, srcArg, null, stat); 2493 return stat; 2494 } 2495 2496 /** 2497 * Create a new file or overwrite an existing file<br> 2498 * 2499 * Once the file is create the client then allocates a new block with the next 2500 * call using {@link ClientProtocol#addBlock}. 2501 * <p> 2502 * For description of parameters and exceptions thrown see 2503 * {@link ClientProtocol#create} 2504 */ 2505 private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 2506 INodesInPath iip, PermissionStatus permissions, String holder, 2507 String clientMachine, boolean create, boolean overwrite, 2508 boolean createParent, short replication, long blockSize, 2509 boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version, 2510 EncryptedKeyVersion edek, boolean logRetryEntry) 2511 throws IOException { 2512 assert hasWriteLock(); 2513 // Verify that the destination does not exist as a directory already. 
2514 final INode inode = iip.getLastINode(); 2515 final String src = iip.getPath(); 2516 if (inode != null && inode.isDirectory()) { 2517 throw new FileAlreadyExistsException(src + 2518 " already exists as a directory"); 2519 } 2520 2521 final INodeFile myFile = INodeFile.valueOf(inode, src, true); 2522 if (isPermissionEnabled) { 2523 if (overwrite && myFile != null) { 2524 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2525 } 2526 /* 2527 * To overwrite existing file, need to check 'w' permission 2528 * of parent (equals to ancestor in this case) 2529 */ 2530 dir.checkAncestorAccess(pc, iip, FsAction.WRITE); 2531 } 2532 if (!createParent) { 2533 dir.verifyParentDir(iip, src); 2534 } 2535 2536 FileEncryptionInfo feInfo = null; 2537 2538 final EncryptionZone zone = dir.getEZForPath(iip); 2539 if (zone != null) { 2540 // The path is now within an EZ, but we're missing encryption parameters 2541 if (suite == null || edek == null) { 2542 throw new RetryStartFileException(); 2543 } 2544 // Path is within an EZ and we have provided encryption parameters. 2545 // Make sure that the generated EDEK matches the settings of the EZ. 
2546 final String ezKeyName = zone.getKeyName(); 2547 if (!ezKeyName.equals(edek.getEncryptionKeyName())) { 2548 throw new RetryStartFileException(); 2549 } 2550 feInfo = new FileEncryptionInfo(suite, version, 2551 edek.getEncryptedKeyVersion().getMaterial(), 2552 edek.getEncryptedKeyIv(), 2553 ezKeyName, edek.getEncryptionKeyVersionName()); 2554 } 2555 2556 try { 2557 BlocksMapUpdateInfo toRemoveBlocks = null; 2558 if (myFile == null) { 2559 if (!create) { 2560 throw new FileNotFoundException("Can't overwrite non-existent " + 2561 src + " for client " + clientMachine); 2562 } 2563 } else { 2564 if (overwrite) { 2565 toRemoveBlocks = new BlocksMapUpdateInfo(); 2566 List<INode> toRemoveINodes = new ChunkedArrayList<INode>(); 2567 long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks, 2568 toRemoveINodes, now()); 2569 if (ret >= 0) { 2570 iip = INodesInPath.replace(iip, iip.length() - 1, null); 2571 FSDirDeleteOp.incrDeletedFileCount(ret); 2572 removeLeasesAndINodes(src, toRemoveINodes, true); 2573 } 2574 } else { 2575 // If lease soft limit time is expired, recover the lease 2576 recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE, 2577 iip, src, holder, clientMachine, false); 2578 throw new FileAlreadyExistsException(src + " for client " + 2579 clientMachine + " already exists"); 2580 } 2581 } 2582 2583 checkFsObjectLimit(); 2584 INodeFile newNode = null; 2585 2586 // Always do an implicit mkdirs for parent directory tree. 2587 Map.Entry<INodesInPath, String> parent = FSDirMkdirOp 2588 .createAncestorDirectories(dir, iip, permissions); 2589 if (parent != null) { 2590 iip = dir.addFile(parent.getKey(), parent.getValue(), permissions, 2591 replication, blockSize, holder, clientMachine); 2592 newNode = iip != null ? 
iip.getLastINode().asFile() : null; 2593 } 2594 2595 if (newNode == null) { 2596 throw new IOException("Unable to add " + src + " to namespace"); 2597 } 2598 leaseManager.addLease(newNode.getFileUnderConstructionFeature() 2599 .getClientName(), src); 2600 2601 // Set encryption attributes if necessary 2602 if (feInfo != null) { 2603 dir.setFileEncryptionInfo(src, feInfo); 2604 newNode = dir.getInode(newNode.getId()).asFile(); 2605 } 2606 2607 setNewINodeStoragePolicy(newNode, iip, isLazyPersist); 2608 2609 // record file record in log, record new generation stamp 2610 getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry); 2611 NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" + 2612 " inode {} holder {}", src, newNode.getId(), holder); 2613 return toRemoveBlocks; 2614 } catch (IOException ie) { 2615 NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " + 2616 ie.getMessage()); 2617 throw ie; 2618 } 2619 } 2620 2621 private void setNewINodeStoragePolicy(INodeFile inode, 2622 INodesInPath iip, 2623 boolean isLazyPersist) 2624 throws IOException { 2625 2626 if (isLazyPersist) { 2627 BlockStoragePolicy lpPolicy = 2628 blockManager.getStoragePolicy("LAZY_PERSIST"); 2629 2630 // Set LAZY_PERSIST storage policy if the flag was passed to 2631 // CreateFile. 2632 if (lpPolicy == null) { 2633 throw new HadoopIllegalArgumentException( 2634 "The LAZY_PERSIST storage policy has been disabled " + 2635 "by the administrator."); 2636 } 2637 inode.setStoragePolicyID(lpPolicy.getId(), 2638 iip.getLatestSnapshotId()); 2639 } else { 2640 BlockStoragePolicy effectivePolicy = 2641 blockManager.getStoragePolicy(inode.getStoragePolicyID()); 2642 2643 if (effectivePolicy != null && 2644 effectivePolicy.isCopyOnCreateFile()) { 2645 // Copy effective policy from ancestor directory to current file. 
2646 inode.setStoragePolicyID(effectivePolicy.getId(), 2647 iip.getLatestSnapshotId()); 2648 } 2649 } 2650 } 2651 2652 /** 2653 * Append to an existing file for append. 2654 * <p> 2655 * 2656 * The method returns the last block of the file if this is a partial block, 2657 * which can still be used for writing more data. The client uses the returned 2658 * block locations to form the data pipeline for this block.<br> 2659 * The method returns null if the last block is full. The client then 2660 * allocates a new block with the next call using 2661 * {@link ClientProtocol#addBlock}. 2662 * <p> 2663 * 2664 * For description of parameters and exceptions thrown see 2665 * {@link ClientProtocol#append(String, String, EnumSetWritable)} 2666 * 2667 * @return the last block locations if the block is partial or null otherwise 2668 */ 2669 private LocatedBlock appendFileInternal(FSPermissionChecker pc, 2670 INodesInPath iip, String holder, String clientMachine, boolean newBlock, 2671 boolean logRetryCache) throws IOException { 2672 assert hasWriteLock(); 2673 // Verify that the destination does not exist as a directory already. 
2674 final INode inode = iip.getLastINode(); 2675 final String src = iip.getPath(); 2676 if (inode != null && inode.isDirectory()) { 2677 throw new FileAlreadyExistsException("Cannot append to directory " + src 2678 + "; already exists as a directory."); 2679 } 2680 if (isPermissionEnabled) { 2681 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2682 } 2683 2684 try { 2685 if (inode == null) { 2686 throw new FileNotFoundException("failed to append to non-existent file " 2687 + src + " for client " + clientMachine); 2688 } 2689 INodeFile myFile = INodeFile.valueOf(inode, src, true); 2690 final BlockStoragePolicy lpPolicy = 2691 blockManager.getStoragePolicy("LAZY_PERSIST"); 2692 if (lpPolicy != null && 2693 lpPolicy.getId() == myFile.getStoragePolicyID()) { 2694 throw new UnsupportedOperationException( 2695 "Cannot append to lazy persist file " + src); 2696 } 2697 // Opening an existing file for append - may need to recover lease. 2698 recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE, 2699 iip, src, holder, clientMachine, false); 2700 2701 final BlockInfoContiguous lastBlock = myFile.getLastBlock(); 2702 // Check that the block has at least minimum replication. 2703 if(lastBlock != null && lastBlock.isComplete() && 2704 !getBlockManager().isSufficientlyReplicated(lastBlock)) { 2705 throw new IOException("append: lastBlock=" + lastBlock + 2706 " of src=" + src + " is not sufficiently replicated yet."); 2707 } 2708 return prepareFileForAppend(src, iip, holder, clientMachine, newBlock, 2709 true, logRetryCache); 2710 } catch (IOException ie) { 2711 NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage()); 2712 throw ie; 2713 } 2714 } 2715 2716 /** 2717 * Convert current node to under construction. 2718 * Recreate in-memory lease record. 
2719 * 2720 * @param src path to the file 2721 * @param leaseHolder identifier of the lease holder on this file 2722 * @param clientMachine identifier of the client machine 2723 * @param newBlock if the data is appended to a new block 2724 * @param writeToEditLog whether to persist this change to the edit log 2725 * @param logRetryCache whether to record RPC ids in editlog for retry cache 2726 * rebuilding 2727 * @return the last block locations if the block is partial or null otherwise 2728 * @throws UnresolvedLinkException 2729 * @throws IOException 2730 */ 2731 LocatedBlock prepareFileForAppend(String src, INodesInPath iip, 2732 String leaseHolder, String clientMachine, boolean newBlock, 2733 boolean writeToEditLog, boolean logRetryCache) throws IOException { 2734 final INodeFile file = iip.getLastINode().asFile(); 2735 final QuotaCounts delta = verifyQuotaForUCBlock(file, iip); 2736 2737 file.recordModification(iip.getLatestSnapshotId()); 2738 file.toUnderConstruction(leaseHolder, clientMachine); 2739 2740 leaseManager.addLease( 2741 file.getFileUnderConstructionFeature().getClientName(), src); 2742 2743 LocatedBlock ret = null; 2744 if (!newBlock) { 2745 ret = blockManager.convertLastBlockToUnderConstruction(file, 0); 2746 if (ret != null && delta != null) { 2747 Preconditions.checkState(delta.getStorageSpace() >= 0, 2748 "appending to a block with size larger than the preferred block size"); 2749 dir.writeLock(); 2750 try { 2751 dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta); 2752 } finally { 2753 dir.writeUnlock(); 2754 } 2755 } 2756 } else { 2757 BlockInfoContiguous lastBlock = file.getLastBlock(); 2758 if (lastBlock != null) { 2759 ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock); 2760 ret = new LocatedBlock(blk, new DatanodeInfo[0]); 2761 } 2762 } 2763 2764 if (writeToEditLog) { 2765 getEditLog().logAppendFile(src, file, newBlock, logRetryCache); 2766 } 2767 return ret; 2768 } 2769 2770 /** 2771 * Verify quota when using 
the preferred block size for UC block. This is 2772 * usually used by append and truncate 2773 * @throws QuotaExceededException when violating the storage quota 2774 * @return expected quota usage update. null means no change or no need to 2775 * update quota usage later 2776 */ 2777 private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip) 2778 throws QuotaExceededException { 2779 if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) { 2780 // Do not check quota if editlog is still being processed 2781 return null; 2782 } 2783 if (file.getLastBlock() != null) { 2784 final QuotaCounts delta = computeQuotaDeltaForUCBlock(file); 2785 dir.readLock(); 2786 try { 2787 FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null); 2788 return delta; 2789 } finally { 2790 dir.readUnlock(); 2791 } 2792 } 2793 return null; 2794 } 2795 2796 /** Compute quota change for converting a complete block to a UC block */ 2797 private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) { 2798 final QuotaCounts delta = new QuotaCounts.Builder().build(); 2799 final BlockInfoContiguous lastBlock = file.getLastBlock(); 2800 if (lastBlock != null) { 2801 final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes(); 2802 final short repl = file.getBlockReplication(); 2803 delta.addStorageSpace(diff * repl); 2804 final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite() 2805 .getPolicy(file.getStoragePolicyID()); 2806 List<StorageType> types = policy.chooseStorageTypes(repl); 2807 for (StorageType t : types) { 2808 if (t.supportTypeQuota()) { 2809 delta.addTypeSpace(t, diff); 2810 } 2811 } 2812 } 2813 return delta; 2814 } 2815 2816 /** 2817 * Recover lease; 2818 * Immediately revoke the lease of the current lease holder and start lease 2819 * recovery so that the file can be forced to be closed. 
2820 * 2821 * @param src the path of the file to start lease recovery 2822 * @param holder the lease holder's name 2823 * @param clientMachine the client machine's name 2824 * @return true if the file is already closed or 2825 * if the lease can be released and the file can be closed. 2826 * @throws IOException 2827 */ 2828 boolean recoverLease(String src, String holder, String clientMachine) 2829 throws IOException { 2830 if (!DFSUtil.isValidName(src)) { 2831 throw new IOException("Invalid file name: " + src); 2832 } 2833 2834 boolean skipSync = false; 2835 FSPermissionChecker pc = getPermissionChecker(); 2836 checkOperation(OperationCategory.WRITE); 2837 writeLock(); 2838 try { 2839 checkOperation(OperationCategory.WRITE); 2840 checkNameNodeSafeMode("Cannot recover the lease of " + src); 2841 final INodesInPath iip = dir.resolvePathForWrite(pc, src); 2842 src = iip.getPath(); 2843 final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src); 2844 if (!inode.isUnderConstruction()) { 2845 return true; 2846 } 2847 if (isPermissionEnabled) { 2848 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2849 } 2850 2851 return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE, 2852 iip, src, holder, clientMachine, true); 2853 } catch (StandbyException se) { 2854 skipSync = true; 2855 throw se; 2856 } finally { 2857 writeUnlock("recoverLease"); 2858 // There might be transactions logged while trying to recover the lease. 2859 // They need to be sync'ed even when an exception was thrown. 
2860 if (!skipSync) { 2861 getEditLog().logSync(); 2862 } 2863 } 2864 } 2865 2866 private enum RecoverLeaseOp { 2867 CREATE_FILE, 2868 APPEND_FILE, 2869 TRUNCATE_FILE, 2870 RECOVER_LEASE; 2871 2872 private String getExceptionMessage(String src, String holder, 2873 String clientMachine, String reason) { 2874 return "Failed to " + this + " " + src + " for " + holder + 2875 " on " + clientMachine + " because " + reason; 2876 } 2877 } 2878 2879 boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip, 2880 String src, String holder, String clientMachine, boolean force) 2881 throws IOException { 2882 assert hasWriteLock(); 2883 INodeFile file = iip.getLastINode().asFile(); 2884 if (file.isUnderConstruction()) { 2885 // 2886 // If the file is under construction , then it must be in our 2887 // leases. Find the appropriate lease record. 2888 // 2889 Lease lease = leaseManager.getLease(holder); 2890 2891 if (!force && lease != null) { 2892 Lease leaseFile = leaseManager.getLeaseByPath(src); 2893 if (leaseFile != null && leaseFile.equals(lease)) { 2894 // We found the lease for this file but the original 2895 // holder is trying to obtain it again. 2896 throw new AlreadyBeingCreatedException( 2897 op.getExceptionMessage(src, holder, clientMachine, 2898 holder + " is already the current lease holder.")); 2899 } 2900 } 2901 // 2902 // Find the original holder. 
2903 // 2904 FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature(); 2905 String clientName = uc.getClientName(); 2906 lease = leaseManager.getLease(clientName); 2907 if (lease == null) { 2908 throw new AlreadyBeingCreatedException( 2909 op.getExceptionMessage(src, holder, clientMachine, 2910 "the file is under construction but no leases found.")); 2911 } 2912 if (force) { 2913 // close now: no need to wait for soft lease expiration and 2914 // close only the file src 2915 LOG.info("recoverLease: " + lease + ", src=" + src + 2916 " from client " + clientName); 2917 return internalReleaseLease(lease, src, iip, holder); 2918 } else { 2919 assert lease.getHolder().equals(clientName) : 2920 "Current lease holder " + lease.getHolder() + 2921 " does not match file creator " + clientName; 2922 // 2923 // If the original holder has not renewed in the last SOFTLIMIT 2924 // period, then start lease recovery. 2925 // 2926 if (lease.expiredSoftLimit()) { 2927 LOG.info("startFile: recover " + lease + ", src=" + src + " client " 2928 + clientName); 2929 if (internalReleaseLease(lease, src, iip, null)) { 2930 return true; 2931 } else { 2932 throw new RecoveryInProgressException( 2933 op.getExceptionMessage(src, holder, clientMachine, 2934 "lease recovery is in progress. 
Try again later.")); 2935 } 2936 } else { 2937 final BlockInfoContiguous lastBlock = file.getLastBlock(); 2938 if (lastBlock != null 2939 && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) { 2940 throw new RecoveryInProgressException( 2941 op.getExceptionMessage(src, holder, clientMachine, 2942 "another recovery is in progress by " 2943 + clientName + " on " + uc.getClientMachine())); 2944 } else { 2945 throw new AlreadyBeingCreatedException( 2946 op.getExceptionMessage(src, holder, clientMachine, 2947 "this file lease is currently owned by " 2948 + clientName + " on " + uc.getClientMachine())); 2949 } 2950 } 2951 } 2952 } else { 2953 return true; 2954 } 2955 } 2956 2957 /** 2958 * Append to an existing file in the namespace. 2959 */ 2960 LastBlockWithStatus appendFile(String src, String holder, 2961 String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache) 2962 throws IOException { 2963 try { 2964 return appendFileInt(src, holder, clientMachine, 2965 flag.contains(CreateFlag.NEW_BLOCK), logRetryCache); 2966 } catch (AccessControlException e) { 2967 logAuditEvent(false, "append", src); 2968 throw e; 2969 } 2970 } 2971 2972 private LastBlockWithStatus appendFileInt(final String srcArg, String holder, 2973 String clientMachine, boolean newBlock, boolean logRetryCache) 2974 throws IOException { 2975 String src = srcArg; 2976 final String operationName = "append"; 2977 NameNode.stateChangeLog.debug( 2978 "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}", 2979 src, holder, clientMachine); 2980 boolean skipSync = false; 2981 if (!supportAppends) { 2982 throw new UnsupportedOperationException( 2983 "Append is not enabled on this NameNode. 
Use the " + 2984 DFS_SUPPORT_APPEND_KEY + " configuration option to enable it."); 2985 } 2986 2987 LocatedBlock lb = null; 2988 HdfsFileStatus stat = null; 2989 FSPermissionChecker pc = getPermissionChecker(); 2990 writeLock(); 2991 try { 2992 checkOperation(OperationCategory.WRITE); 2993 checkNameNodeSafeMode("Cannot append to file" + src); 2994 final INodesInPath iip = dir.resolvePathForWrite(pc, src); 2995 src = iip.getPath(); 2996 lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock, 2997 logRetryCache); 2998 stat = FSDirStatAndListingOp.getFileInfo(dir, src, false, 2999 FSDirectory.isReservedRawName(srcArg)); 3000 } catch (StandbyException se) { 3001 skipSync = true; 3002 throw se; 3003 } finally { 3004 writeUnlock(operationName); 3005 // There might be transactions logged while trying to recover the lease. 3006 // They need to be sync'ed even when an exception was thrown. 3007 if (!skipSync) { 3008 getEditLog().logSync(); 3009 } 3010 } 3011 if (lb != null) { 3012 NameNode.stateChangeLog.debug( 3013 "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" + 3014 " size {}", src, holder, clientMachine, lb.getBlock(), 3015 lb.getBlock().getNumBytes()); 3016 } 3017 logAuditEvent(true, operationName, srcArg); 3018 return new LastBlockWithStatus(lb, stat); 3019 } 3020 3021 ExtendedBlock getExtendedBlock(Block blk) { 3022 return new ExtendedBlock(blockPoolId, blk); 3023 } 3024 3025 void setBlockPoolId(String bpid) { 3026 blockPoolId = bpid; 3027 blockManager.setBlockPoolId(blockPoolId); 3028 } 3029 3030 /** 3031 * The client would like to obtain an additional block for the indicated 3032 * filename (which is being written-to). Return an array that consists 3033 * of the block, plus a set of machines. The first on this list should 3034 * be where the client writes data. Subsequent items in the list must 3035 * be provided in the connection to the first datanode. 
3036 * 3037 * Make sure the previous blocks have been reported by datanodes and 3038 * are replicated. Will return an empty 2-elt array if we want the 3039 * client to "try again later". 3040 */ 3041 LocatedBlock getAdditionalBlock(String src, long fileId, String clientName, 3042 ExtendedBlock previous, Set<Node> excludedNodes, 3043 List<String> favoredNodes) throws IOException { 3044 LocatedBlock[] onRetryBlock = new LocatedBlock[1]; 3045 DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId, 3046 clientName, previous, excludedNodes, favoredNodes, onRetryBlock); 3047 if (targets == null) { 3048 assert onRetryBlock[0] != null : "Retry block is null"; 3049 // This is a retry. Just return the last block. 3050 return onRetryBlock[0]; 3051 } 3052 LocatedBlock newBlock = storeAllocatedBlock( 3053 src, fileId, clientName, previous, targets); 3054 return newBlock; 3055 } 3056 3057 /** 3058 * Part I of getAdditionalBlock(). 3059 * Analyze the state of the file under read lock to determine if the client 3060 * can add a new block, detect potential retries, lease mismatches, 3061 * and minimal replication of the penultimate block. 3062 * 3063 * Generate target DataNode locations for the new block, 3064 * but do not create the new block yet. 
3065 */ 3066 DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId, 3067 String clientName, ExtendedBlock previous, Set<Node> excludedNodes, 3068 List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException { 3069 final long blockSize; 3070 final int replication; 3071 final byte storagePolicyID; 3072 Node clientNode = null; 3073 String clientMachine = null; 3074 3075 NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {} inodeId {}" + 3076 " for {}", src, fileId, clientName); 3077 3078 checkOperation(OperationCategory.READ); 3079 FSPermissionChecker pc = getPermissionChecker(); 3080 readLock(); 3081 try { 3082 checkOperation(OperationCategory.READ); 3083 INodesInPath iip = dir.resolvePath(pc, src, fileId); 3084 src = iip.getPath(); 3085 FileState fileState = analyzeFileState( 3086 iip, fileId, clientName, previous, onRetryBlock); 3087 if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) { 3088 // This is a retry. No need to generate new locations. 3089 // Use the last block if it has locations. 
3090 return null; 3091 } 3092 3093 final INodeFile pendingFile = fileState.inode; 3094 if (!checkFileProgress(src, pendingFile, false)) { 3095 throw new NotReplicatedYetException("Not replicated yet: " + src); 3096 } 3097 src = fileState.path; 3098 3099 if (pendingFile.getBlocks().length >= maxBlocksPerFile) { 3100 throw new IOException("File has reached the limit on maximum number of" 3101 + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY 3102 + "): " + pendingFile.getBlocks().length + " >= " 3103 + maxBlocksPerFile); 3104 } 3105 blockSize = pendingFile.getPreferredBlockSize(); 3106 clientMachine = pendingFile.getFileUnderConstructionFeature() 3107 .getClientMachine(); 3108 clientNode = blockManager.getDatanodeManager().getDatanodeByHost( 3109 clientMachine); 3110 replication = pendingFile.getFileReplication(); 3111 storagePolicyID = pendingFile.getStoragePolicyID(); 3112 } finally { 3113 readUnlock("getNewBlockTargets"); 3114 } 3115 3116 if (clientNode == null) { 3117 clientNode = getClientNode(clientMachine); 3118 } 3119 3120 // choose targets for the new block to be allocated. 3121 return getBlockManager().chooseTarget4NewBlock( 3122 src, replication, clientNode, excludedNodes, blockSize, favoredNodes, 3123 storagePolicyID); 3124 } 3125 3126 /** 3127 * Part II of getAdditionalBlock(). 3128 * Should repeat the same analysis of the file state as in Part 1, 3129 * but under the write lock. 3130 * If the conditions still hold, then allocate a new block with 3131 * the new targets, add it to the INode and to the BlocksMap. 
3132 */ 3133 LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName, 3134 ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException { 3135 Block newBlock = null; 3136 long offset; 3137 checkOperation(OperationCategory.WRITE); 3138 waitForLoadingFSImage(); 3139 writeLock(); 3140 try { 3141 checkOperation(OperationCategory.WRITE); 3142 // Run the full analysis again, since things could have changed 3143 // while chooseTarget() was executing. 3144 LocatedBlock[] onRetryBlock = new LocatedBlock[1]; 3145 final INodesInPath iip = dir.resolvePath(null, src, fileId); 3146 FileState fileState = 3147 analyzeFileState(iip, fileId, clientName, previous, onRetryBlock); 3148 final INodeFile pendingFile = fileState.inode; 3149 src = fileState.path; 3150 3151 if (onRetryBlock[0] != null) { 3152 if (onRetryBlock[0].getLocations().length > 0) { 3153 // This is a retry. Just return the last block if having locations. 3154 return onRetryBlock[0]; 3155 } else { 3156 // add new chosen targets to already allocated block and return 3157 BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock(); 3158 ((BlockInfoContiguousUnderConstruction) lastBlockInFile) 3159 .setExpectedLocations(targets); 3160 offset = pendingFile.computeFileSize(); 3161 return makeLocatedBlock(lastBlockInFile, targets, offset); 3162 } 3163 } 3164 3165 // commit the last block and complete it if it has minimum replicas 3166 commitOrCompleteLastBlock(pendingFile, fileState.iip, 3167 ExtendedBlock.getLocalBlock(previous)); 3168 3169 // allocate new block, record block locations in INode. 
3170 newBlock = createNewBlock(); 3171 INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile); 3172 saveAllocatedBlock(src, inodesInPath, newBlock, targets); 3173 3174 persistNewBlock(src, pendingFile); 3175 offset = pendingFile.computeFileSize(); 3176 } finally { 3177 writeUnlock("storeAllocatedBlock"); 3178 } 3179 getEditLog().logSync(); 3180 3181 // Return located block 3182 return makeLocatedBlock(newBlock, targets, offset); 3183 } 3184 3185 /* 3186 * Resolve clientmachine address to get a network location path 3187 */ 3188 private Node getClientNode(String clientMachine) { 3189 List<String> hosts = new ArrayList<String>(1); 3190 hosts.add(clientMachine); 3191 List<String> rName = getBlockManager().getDatanodeManager() 3192 .resolveNetworkLocation(hosts); 3193 Node clientNode = null; 3194 if (rName != null) { 3195 // Able to resolve clientMachine mapping. 3196 // Create a temp node to findout the rack local nodes 3197 clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR 3198 + clientMachine); 3199 } 3200 return clientNode; 3201 } 3202 3203 static class FileState { 3204 public final INodeFile inode; 3205 public final String path; 3206 public final INodesInPath iip; 3207 3208 public FileState(INodeFile inode, String fullPath, INodesInPath iip) { 3209 this.inode = inode; 3210 this.path = fullPath; 3211 this.iip = iip; 3212 } 3213 } 3214 3215 private FileState analyzeFileState( 3216 INodesInPath iip, long fileId, String clientName, 3217 ExtendedBlock previous, LocatedBlock[] onRetryBlock) 3218 throws IOException { 3219 assert hasReadLock(); 3220 String src = iip.getPath(); 3221 checkBlock(previous); 3222 onRetryBlock[0] = null; 3223 checkNameNodeSafeMode("Cannot add block to " + src); 3224 3225 // have we exceeded the configured limit of fs objects. 
3226 checkFsObjectLimit(); 3227 3228 Block previousBlock = ExtendedBlock.getLocalBlock(previous); 3229 final INodeFile pendingFile = checkLease(iip, clientName, fileId); 3230 BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock(); 3231 if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) { 3232 // The block that the client claims is the current last block 3233 // doesn't match up with what we think is the last block. There are 3234 // four possibilities: 3235 // 1) This is the first block allocation of an append() pipeline 3236 // which started appending exactly at or exceeding the block boundary. 3237 // In this case, the client isn't passed the previous block, 3238 // so it makes the allocateBlock() call with previous=null. 3239 // We can distinguish this since the last block of the file 3240 // will be exactly a full block. 3241 // 2) This is a retry from a client that missed the response of a 3242 // prior getAdditionalBlock() call, perhaps because of a network 3243 // timeout, or because of an HA failover. In that case, we know 3244 // by the fact that the client is re-issuing the RPC that it 3245 // never began to write to the old block. Hence it is safe to 3246 // to return the existing block. 3247 // 3) This is an entirely bogus request/bug -- we should error out 3248 // rather than potentially appending a new block with an empty 3249 // one in the middle, etc 3250 // 4) This is a retry from a client that timed out while 3251 // the prior getAdditionalBlock() is still being processed, 3252 // currently working on chooseTarget(). 3253 // There are no means to distinguish between the first and 3254 // the second attempts in Part I, because the first one hasn't 3255 // changed the namesystem state yet. 3256 // We run this analysis again in Part II where case 4 is impossible. 
3257 3258 BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock(); 3259 if (previous == null && 3260 lastBlockInFile != null && 3261 lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() && 3262 lastBlockInFile.isComplete()) { 3263 // Case 1 3264 NameNode.stateChangeLog.debug( 3265 "BLOCK* NameSystem.allocateBlock: handling block allocation" + 3266 " writing to a file with a complete previous block: src={}" + 3267 " lastBlock={}", src, lastBlockInFile); 3268 } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) { 3269 if (lastBlockInFile.getNumBytes() != 0) { 3270 throw new IOException( 3271 "Request looked like a retry to allocate block " + 3272 lastBlockInFile + " but it already contains " + 3273 lastBlockInFile.getNumBytes() + " bytes"); 3274 } 3275 3276 // Case 2 3277 // Return the last block. 3278 NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + 3279 "caught retry for allocation of a new block in " + 3280 src + ". Returning previously allocated block " + lastBlockInFile); 3281 long offset = pendingFile.computeFileSize(); 3282 onRetryBlock[0] = makeLocatedBlock(lastBlockInFile, 3283 ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(), 3284 offset); 3285 return new FileState(pendingFile, src, iip); 3286 } else { 3287 // Case 3 3288 throw new IOException("Cannot allocate block in " + src + ": " + 3289 "passed 'previous' block " + previous + " does not match actual " + 3290 "last block in file " + lastBlockInFile); 3291 } 3292 } 3293 return new FileState(pendingFile, src, iip); 3294 } 3295 3296 LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs, 3297 long offset) throws IOException { 3298 LocatedBlock lBlk = new LocatedBlock( 3299 getExtendedBlock(blk), locs, offset, false); 3300 getBlockManager().setBlockToken( 3301 lBlk, BlockTokenSecretManager.AccessMode.WRITE); 3302 return lBlk; 3303 } 3304 3305 /** @see ClientProtocol#getAdditionalDatanode */ 3306 
LocatedBlock getAdditionalDatanode(String src, long fileId, 3307 final ExtendedBlock blk, final DatanodeInfo[] existings, 3308 final String[] storageIDs, 3309 final Set<Node> excludes, 3310 final int numAdditionalNodes, final String clientName 3311 ) throws IOException { 3312 //check if the feature is enabled 3313 dtpReplaceDatanodeOnFailure.checkEnabled(); 3314 3315 Node clientnode = null; 3316 String clientMachine; 3317 final long preferredblocksize; 3318 final byte storagePolicyID; 3319 final List<DatanodeStorageInfo> chosen; 3320 checkOperation(OperationCategory.READ); 3321 FSPermissionChecker pc = getPermissionChecker(); 3322 readLock(); 3323 try { 3324 checkOperation(OperationCategory.READ); 3325 //check safe mode 3326 checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk); 3327 final INodesInPath iip = dir.resolvePath(pc, src, fileId); 3328 src = iip.getPath(); 3329 3330 //check lease 3331 final INodeFile file = checkLease(iip, clientName, fileId); 3332 clientMachine = file.getFileUnderConstructionFeature().getClientMachine(); 3333 clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine); 3334 preferredblocksize = file.getPreferredBlockSize(); 3335 storagePolicyID = file.getStoragePolicyID(); 3336 3337 //find datanode storages 3338 final DatanodeManager dm = blockManager.getDatanodeManager(); 3339 chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs, 3340 "src=%s, fileId=%d, blk=%s, clientName=%s, clientMachine=%s", 3341 src, fileId, blk, clientName, clientMachine)); 3342 } finally { 3343 readUnlock("getAdditionalDatanode"); 3344 } 3345 3346 if (clientnode == null) { 3347 clientnode = getClientNode(clientMachine); 3348 } 3349 3350 // choose new datanodes. 
3351 final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode( 3352 src, numAdditionalNodes, clientnode, chosen, 3353 excludes, preferredblocksize, storagePolicyID); 3354 final LocatedBlock lb = new LocatedBlock(blk, targets); 3355 blockManager.setBlockToken(lb, AccessMode.COPY); 3356 return lb; 3357 } 3358 3359 /** 3360 * The client would like to let go of the given block 3361 */ 3362 boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder) 3363 throws IOException { 3364 NameNode.stateChangeLog.debug( 3365 "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src); 3366 checkOperation(OperationCategory.WRITE); 3367 FSPermissionChecker pc = getPermissionChecker(); 3368 waitForLoadingFSImage(); 3369 writeLock(); 3370 final INodesInPath iip = dir.resolvePath(pc, src, fileId); 3371 src = iip.getPath(); 3372 try { 3373 checkOperation(OperationCategory.WRITE); 3374 checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src); 3375 final INodeFile file = checkLease(iip, holder, fileId); 3376 3377 // Remove the block from the pending creates list 3378 boolean removed = dir.removeBlock(src, iip, file, 3379 ExtendedBlock.getLocalBlock(b)); 3380 if (!removed) { 3381 return true; 3382 } 3383 NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " + 3384 "removed from pendingCreates", b); 3385 persistBlocks(src, file, false); 3386 } finally { 3387 writeUnlock("abandonBlock"); 3388 } 3389 getEditLog().logSync(); 3390 3391 return true; 3392 } 3393 3394 private INodeFile checkLease(INodesInPath iip, String holder, long fileId) 3395 throws LeaseExpiredException, FileNotFoundException { 3396 String src = iip.getPath(); 3397 INode inode = iip.getLastINode(); 3398 assert hasReadLock(); 3399 final String ident = src + " (inode " + fileId + ")"; 3400 if (inode == null) { 3401 Lease lease = leaseManager.getLease(holder); 3402 throw new LeaseExpiredException( 3403 "No lease on " + ident + ": File does not exist. 
" 3404 + (lease != null ? lease.toString() 3405 : "Holder " + holder + " does not have any open files.")); 3406 } 3407 if (!inode.isFile()) { 3408 Lease lease = leaseManager.getLease(holder); 3409 throw new LeaseExpiredException( 3410 "No lease on " + ident + ": INode is not a regular file. " 3411 + (lease != null ? lease.toString() 3412 : "Holder " + holder + " does not have any open files.")); 3413 } 3414 final INodeFile file = inode.asFile(); 3415 if (!file.isUnderConstruction()) { 3416 Lease lease = leaseManager.getLease(holder); 3417 throw new LeaseExpiredException( 3418 "No lease on " + ident + ": File is not open for writing. " 3419 + (lease != null ? lease.toString() 3420 : "Holder " + holder + " does not have any open files.")); 3421 } 3422 // No further modification is allowed on a deleted file. 3423 // A file is considered deleted, if it is not in the inodeMap or is marked 3424 // as deleted in the snapshot feature. 3425 if (isFileDeleted(file)) { 3426 throw new FileNotFoundException(src); 3427 } 3428 String clientName = file.getFileUnderConstructionFeature().getClientName(); 3429 if (holder != null && !clientName.equals(holder)) { 3430 throw new LeaseExpiredException("Lease mismatch on " + ident + 3431 " owned by " + clientName + " but is accessed by " + holder); 3432 } 3433 return file; 3434 } 3435 3436 /** 3437 * Complete in-progress write to the given file. 
3438 * @return true if successful, false if the client should continue to retry 3439 * (e.g if not all blocks have reached minimum replication yet) 3440 * @throws IOException on error (eg lease mismatch, file not open, file deleted) 3441 */ 3442 boolean completeFile(final String srcArg, String holder, 3443 ExtendedBlock last, long fileId) 3444 throws SafeModeException, UnresolvedLinkException, IOException { 3445 String src = srcArg; 3446 NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}", 3447 src, holder); 3448 checkBlock(last); 3449 boolean success = false; 3450 checkOperation(OperationCategory.WRITE); 3451 waitForLoadingFSImage(); 3452 writeLock(); 3453 try { 3454 checkOperation(OperationCategory.WRITE); 3455 checkNameNodeSafeMode("Cannot complete file " + src); 3456 success = completeFileInternal(src, holder, 3457 ExtendedBlock.getLocalBlock(last), fileId); 3458 } finally { 3459 writeUnlock("completeFile"); 3460 } 3461 getEditLog().logSync(); 3462 if (success) { 3463 NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg 3464 + " is closed by " + holder); 3465 } 3466 return success; 3467 } 3468 3469 private boolean completeFileInternal(String src, String holder, Block last, 3470 long fileId) throws IOException { 3471 assert hasWriteLock(); 3472 final INodeFile pendingFile; 3473 FSPermissionChecker pc = getPermissionChecker(); 3474 final INodesInPath iip = dir.resolvePath(pc, src, fileId); 3475 src = iip.getPath(); 3476 INode inode = null; 3477 try { 3478 inode = iip.getLastINode(); 3479 pendingFile = checkLease(iip, holder, fileId); 3480 } catch (LeaseExpiredException lee) { 3481 if (inode != null && inode.isFile() && 3482 !inode.asFile().isUnderConstruction()) { 3483 // This could be a retry RPC - i.e the client tried to close 3484 // the file, but missed the RPC response. Thus, it is trying 3485 // again to close the file. 
If the file still exists and 3486 // the client's view of the last block matches the actual 3487 // last block, then we'll treat it as a successful close. 3488 // See HDFS-3031. 3489 final Block realLastBlock = inode.asFile().getLastBlock(); 3490 if (Block.matchingIdAndGenStamp(last, realLastBlock)) { 3491 NameNode.stateChangeLog.info("DIR* completeFile: " + 3492 "request from " + holder + " to complete inode " + fileId + 3493 "(" + src + ") which is already closed. But, it appears to be " + 3494 "an RPC retry. Returning success"); 3495 return true; 3496 } 3497 } 3498 throw lee; 3499 } 3500 // Check the state of the penultimate block. It should be completed 3501 // before attempting to complete the last one. 3502 if (!checkFileProgress(src, pendingFile, false)) { 3503 return false; 3504 } 3505 3506 // commit the last block and complete it if it has minimum replicas 3507 commitOrCompleteLastBlock(pendingFile, iip, last); 3508 3509 if (!checkFileProgress(src, pendingFile, true)) { 3510 return false; 3511 } 3512 3513 finalizeINodeFileUnderConstruction(src, pendingFile, 3514 Snapshot.CURRENT_STATE_ID); 3515 return true; 3516 } 3517 3518 /** 3519 * Save allocated block at the given pending filename 3520 * 3521 * @param src path to the file 3522 * @param inodesInPath representing each of the components of src. 3523 * The last INode is the INode for {@code src} file. 
3524 * @param newBlock newly allocated block to be save 3525 * @param targets target datanodes where replicas of the new block is placed 3526 * @throws QuotaExceededException If addition of block exceeds space quota 3527 */ 3528 BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath, 3529 Block newBlock, DatanodeStorageInfo[] targets) 3530 throws IOException { 3531 assert hasWriteLock(); 3532 BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets); 3533 NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src); 3534 DatanodeStorageInfo.incrementBlocksScheduled(targets); 3535 return b; 3536 } 3537 3538 /** 3539 * Create new block with a unique block id and a new generation stamp. 3540 */ 3541 Block createNewBlock() throws IOException { 3542 assert hasWriteLock(); 3543 Block b = new Block(nextBlockId(), 0, 0); 3544 // Increment the generation stamp for every new block. 3545 b.setGenerationStamp(nextGenerationStamp(false)); 3546 return b; 3547 } 3548 3549 /** 3550 * Check that the indicated file's blocks are present and 3551 * replicated. If not, return false. If checkall is true, then check 3552 * all blocks, otherwise check only penultimate block. 3553 */ 3554 boolean checkFileProgress(String src, INodeFile v, boolean checkall) { 3555 if (checkall) { 3556 // check all blocks of the file. 
3557 for (BlockInfoContiguous block: v.getBlocks()) { 3558 if (!isCompleteBlock(src, block, blockManager.minReplication)) { 3559 return false; 3560 } 3561 } 3562 } else { 3563 // check the penultimate block of this file 3564 BlockInfoContiguous b = v.getPenultimateBlock(); 3565 if (b != null 3566 && !isCompleteBlock(src, b, blockManager.minReplication)) { 3567 return false; 3568 } 3569 } 3570 return true; 3571 } 3572 3573 private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) { 3574 if (!b.isComplete()) { 3575 final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b; 3576 final int numNodes = b.numNodes(); 3577 LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = " 3578 + uc.getBlockUCState() + ", replication# = " + numNodes 3579 + (numNodes < minRepl? " < ": " >= ") 3580 + " minimum = " + minRepl + ") in file " + src); 3581 return false; 3582 } 3583 return true; 3584 } 3585 3586 //////////////////////////////////////////////////////////////// 3587 // Here's how to handle block-copy failure during client write: 3588 // -- As usual, the client's write should result in a streaming 3589 // backup write to a k-machine sequence. 3590 // -- If one of the backup machines fails, no worries. Fail silently. 3591 // -- Before client is allowed to close and finalize file, make sure 3592 // that the blocks are backed up. Namenode may have to issue specific backup 3593 // commands to make up for earlier datanode failures. Once all copies 3594 // are made, edit namespace and return to client. 3595 //////////////////////////////////////////////////////////////// 3596 3597 /** 3598 * Change the indicated filename. 3599 * @deprecated Use {@link #renameTo(String, String, boolean, 3600 * Options.Rename...)} instead. 
3601 */ 3602 @Deprecated 3603 boolean renameTo(String src, String dst, boolean logRetryCache) 3604 throws IOException { 3605 final String operationName = "rename"; 3606 waitForLoadingFSImage(); 3607 FSDirRenameOp.RenameOldResult ret = null; 3608 writeLock(); 3609 try { 3610 checkOperation(OperationCategory.WRITE); 3611 checkNameNodeSafeMode("Cannot rename " + src); 3612 ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache); 3613 } catch (AccessControlException e) { 3614 logAuditEvent(false, operationName, src, dst, null); 3615 throw e; 3616 } finally { 3617 writeUnlock(operationName); 3618 } 3619 boolean success = ret != null && ret.success; 3620 if (success) { 3621 getEditLog().logSync(); 3622 } 3623 logAuditEvent(success, "rename", src, dst, 3624 ret == null ? null : ret.auditStat); 3625 return success; 3626 } 3627 3628 void renameTo(final String src, final String dst, 3629 boolean logRetryCache, Options.Rename... options) 3630 throws IOException { 3631 final String operationName = "rename"; 3632 waitForLoadingFSImage(); 3633 Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null; 3634 writeLock(); 3635 try { 3636 checkOperation(OperationCategory.WRITE); 3637 checkNameNodeSafeMode("Cannot rename " + src); 3638 res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options); 3639 } catch (AccessControlException e) { 3640 logAuditEvent(false, operationName + " (options=" + 3641 Arrays.toString(options) + ")", src, dst, null); 3642 throw e; 3643 } finally { 3644 writeUnlock(operationName); 3645 } 3646 3647 getEditLog().logSync(); 3648 3649 BlocksMapUpdateInfo collectedBlocks = res.getKey(); 3650 HdfsFileStatus auditStat = res.getValue(); 3651 if (!collectedBlocks.getToDeleteList().isEmpty()) { 3652 removeBlocks(collectedBlocks); 3653 collectedBlocks.clear(); 3654 } 3655 3656 logAuditEvent(true, operationName + " (options=" + 3657 Arrays.toString(options) + ")", src, dst, auditStat); 3658 } 3659 3660 /** 3661 * Remove the indicated file from 
namespace. 3662 * 3663 * @see ClientProtocol#delete(String, boolean) for detailed description and 3664 * description of exceptions 3665 */ 3666 boolean delete(String src, boolean recursive, boolean logRetryCache) 3667 throws IOException { 3668 waitForLoadingFSImage(); 3669 final String operationName = "delete"; 3670 BlocksMapUpdateInfo toRemovedBlocks = null; 3671 writeLock(); 3672 boolean ret = false; 3673 try { 3674 checkOperation(OperationCategory.WRITE); 3675 checkNameNodeSafeMode("Cannot delete " + src); 3676 toRemovedBlocks = FSDirDeleteOp.delete( 3677 this, src, recursive, logRetryCache); 3678 ret = toRemovedBlocks != null; 3679 } catch (AccessControlException e) { 3680 logAuditEvent(false, operationName, src); 3681 throw e; 3682 } finally { 3683 writeUnlock(operationName); 3684 } 3685 getEditLog().logSync(); 3686 if (toRemovedBlocks != null) { 3687 removeBlocks(toRemovedBlocks); // Incremental deletion of blocks 3688 } 3689 logAuditEvent(true, operationName, src); 3690 return ret; 3691 } 3692 3693 FSPermissionChecker getPermissionChecker() 3694 throws AccessControlException { 3695 return dir.getPermissionChecker(); 3696 } 3697 3698 /** 3699 * From the given list, incrementally remove the blocks from blockManager 3700 * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to 3701 * ensure that other waiters on the lock can get in. 
See HDFS-2938 3702 * 3703 * @param blocks 3704 * An instance of {@link BlocksMapUpdateInfo} which contains a list 3705 * of blocks that need to be removed from blocksMap 3706 */ 3707 void removeBlocks(BlocksMapUpdateInfo blocks) { 3708 List<Block> toDeleteList = blocks.getToDeleteList(); 3709 Iterator<Block> iter = toDeleteList.iterator(); 3710 while (iter.hasNext()) { 3711 writeLock(); 3712 try { 3713 for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) { 3714 blockManager.removeBlock(iter.next()); 3715 } 3716 } finally { 3717 writeUnlock("removeBlocks"); 3718 } 3719 } 3720 } 3721 3722 /** 3723 * Remove leases and inodes related to a given path 3724 * @param src The given path 3725 * @param removedINodes Containing the list of inodes to be removed from 3726 * inodesMap 3727 * @param acquireINodeMapLock Whether to acquire the lock for inode removal 3728 */ 3729 void removeLeasesAndINodes(String src, List<INode> removedINodes, 3730 final boolean acquireINodeMapLock) { 3731 assert hasWriteLock(); 3732 leaseManager.removeLeaseWithPrefixPath(src); 3733 // remove inodes from inodesMap 3734 if (removedINodes != null) { 3735 if (acquireINodeMapLock) { 3736 dir.writeLock(); 3737 } 3738 try { 3739 dir.removeFromInodeMap(removedINodes); 3740 } finally { 3741 if (acquireINodeMapLock) { 3742 dir.writeUnlock(); 3743 } 3744 } 3745 removedINodes.clear(); 3746 } 3747 } 3748 3749 /** 3750 * Removes the blocks from blocksmap and updates the safemode blocks total 3751 * 3752 * @param blocks 3753 * An instance of {@link BlocksMapUpdateInfo} which contains a list 3754 * of blocks that need to be removed from blocksMap 3755 */ 3756 void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) { 3757 assert hasWriteLock(); 3758 // In the case that we are a Standby tailing edits from the 3759 // active while in safe-mode, we need to track the total number 3760 // of blocks and safe blocks in the system. 
3761 boolean trackBlockCounts = isSafeModeTrackingBlocks(); 3762 int numRemovedComplete = 0, numRemovedSafe = 0; 3763 3764 for (Block b : blocks.getToDeleteList()) { 3765 if (trackBlockCounts) { 3766 BlockInfoContiguous bi = getStoredBlock(b); 3767 if (bi.isComplete()) { 3768 numRemovedComplete++; 3769 if (bi.numNodes() >= blockManager.minReplication) { 3770 numRemovedSafe++; 3771 } 3772 } 3773 } 3774 blockManager.removeBlock(b); 3775 } 3776 if (trackBlockCounts) { 3777 if (LOG.isDebugEnabled()) { 3778 LOG.debug("Adjusting safe-mode totals for deletion." 3779 + "decreasing safeBlocks by " + numRemovedSafe 3780 + ", totalBlocks by " + numRemovedComplete); 3781 } 3782 adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete); 3783 } 3784 } 3785 3786 /** 3787 * @see SafeModeInfo#shouldIncrementallyTrackBlocks 3788 */ 3789 private boolean isSafeModeTrackingBlocks() { 3790 if (!haEnabled) { 3791 // Never track blocks incrementally in non-HA code. 3792 return false; 3793 } 3794 SafeModeInfo sm = this.safeMode; 3795 return sm != null && sm.shouldIncrementallyTrackBlocks(); 3796 } 3797 3798 /** 3799 * Get the file info for a specific file. 3800 * 3801 * @param src The string representation of the path to the file 3802 * @param resolveLink whether to throw UnresolvedLinkException 3803 * if src refers to a symlink 3804 * 3805 * @throws AccessControlException if access is denied 3806 * @throws UnresolvedLinkException if a symlink is encountered. 
3807 * 3808 * @return object containing information regarding the file 3809 * or null if file not found 3810 * @throws StandbyException 3811 */ 3812 HdfsFileStatus getFileInfo(final String src, boolean resolveLink) 3813 throws IOException { 3814 final String operationName = "getfileinfo"; 3815 checkOperation(OperationCategory.READ); 3816 HdfsFileStatus stat = null; 3817 readLock(); 3818 try { 3819 checkOperation(OperationCategory.READ); 3820 stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink); 3821 } catch (AccessControlException e) { 3822 logAuditEvent(false, operationName, src); 3823 throw e; 3824 } finally { 3825 readUnlock(operationName); 3826 } 3827 logAuditEvent(true, operationName, src); 3828 return stat; 3829 } 3830 3831 /** 3832 * Returns true if the file is closed 3833 */ 3834 boolean isFileClosed(final String src) throws IOException { 3835 final String operationName = "isFileClosed"; 3836 checkOperation(OperationCategory.READ); 3837 readLock(); 3838 try { 3839 checkOperation(OperationCategory.READ); 3840 return FSDirStatAndListingOp.isFileClosed(dir, src); 3841 } catch (AccessControlException e) { 3842 logAuditEvent(false, operationName, src); 3843 throw e; 3844 } finally { 3845 readUnlock(operationName); 3846 } 3847 } 3848 3849 /** 3850 * Create all the necessary directories 3851 */ 3852 boolean mkdirs(String src, PermissionStatus permissions, 3853 boolean createParent) throws IOException { 3854 final String operationName = "mkdirs"; 3855 HdfsFileStatus auditStat = null; 3856 checkOperation(OperationCategory.WRITE); 3857 writeLock(); 3858 try { 3859 checkOperation(OperationCategory.WRITE); 3860 checkNameNodeSafeMode("Cannot create directory " + src); 3861 auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent); 3862 } catch (AccessControlException e) { 3863 logAuditEvent(false, operationName, src); 3864 throw e; 3865 } finally { 3866 writeUnlock(operationName); 3867 } 3868 getEditLog().logSync(); 3869 logAuditEvent(true, 
operationName, src, null, auditStat); 3870 return true; 3871 } 3872 3873 /** 3874 * Get the content summary for a specific file/dir. 3875 * 3876 * @param src The string representation of the path to the file 3877 * 3878 * @throws AccessControlException if access is denied 3879 * @throws UnresolvedLinkException if a symlink is encountered. 3880 * @throws FileNotFoundException if no file exists 3881 * @throws StandbyException 3882 * @throws IOException for issues with writing to the audit log 3883 * 3884 * @return object containing information regarding the file 3885 * or null if file not found 3886 */ 3887 ContentSummary getContentSummary(final String src) throws IOException { 3888 checkOperation(OperationCategory.READ); 3889 final String operationName = "contentSummary"; 3890 readLock(); 3891 boolean success = true; 3892 try { 3893 checkOperation(OperationCategory.READ); 3894 return FSDirStatAndListingOp.getContentSummary(dir, src); 3895 } catch (AccessControlException ace) { 3896 success = false; 3897 throw ace; 3898 } finally { 3899 readUnlock(operationName); 3900 logAuditEvent(success, operationName, src); 3901 } 3902 } 3903 3904 /** 3905 * Set the namespace quota and storage space quota for a directory. 3906 * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the 3907 * contract. 3908 * 3909 * Note: This does not support ".inodes" relative path. 
3910 */ 3911 void setQuota(String src, long nsQuota, long ssQuota, StorageType type) 3912 throws IOException { 3913 checkOperation(OperationCategory.WRITE); 3914 final String operationName = "setQuota"; 3915 writeLock(); 3916 boolean success = false; 3917 try { 3918 checkOperation(OperationCategory.WRITE); 3919 checkNameNodeSafeMode("Cannot set quota on " + src); 3920 FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type); 3921 success = true; 3922 } finally { 3923 writeUnlock(operationName); 3924 if (success) { 3925 getEditLog().logSync(); 3926 } 3927 logAuditEvent(success, operationName, src); 3928 } 3929 } 3930 3931 /** Persist all metadata about this file. 3932 * @param src The string representation of the path 3933 * @param fileId The inode ID that we're fsyncing. Older clients will pass 3934 * INodeId.GRANDFATHER_INODE_ID here. 3935 * @param clientName The string representation of the client 3936 * @param lastBlockLength The length of the last block 3937 * under construction reported from client. 
3938 * @throws IOException if path does not exist 3939 */ 3940 void fsync(String src, long fileId, String clientName, long lastBlockLength) 3941 throws IOException { 3942 NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName); 3943 checkOperation(OperationCategory.WRITE); 3944 3945 FSPermissionChecker pc = getPermissionChecker(); 3946 waitForLoadingFSImage(); 3947 writeLock(); 3948 try { 3949 checkOperation(OperationCategory.WRITE); 3950 checkNameNodeSafeMode("Cannot fsync file " + src); 3951 INodesInPath iip = dir.resolvePath(pc, src, fileId); 3952 src = iip.getPath(); 3953 final INodeFile pendingFile = checkLease(iip, clientName, fileId); 3954 if (lastBlockLength > 0) { 3955 pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock( 3956 pendingFile, lastBlockLength); 3957 } 3958 persistBlocks(src, pendingFile, false); 3959 } finally { 3960 writeUnlock("fsync"); 3961 } 3962 getEditLog().logSync(); 3963 } 3964 3965 /** 3966 * Move a file that is being written to be immutable. 3967 * @param src The filename 3968 * @param lease The lease for the client creating the file 3969 * @param recoveryLeaseHolder reassign lease to this holder if the last block 3970 * needs recovery; keep current holder if null. 3971 * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal 3972 * replication;<br> 3973 * RecoveryInProgressException if lease recovery is in progress.<br> 3974 * IOException in case of an error. 3975 * @return true if file has been successfully finalized and closed or 3976 * false if block recovery has been initiated. Since the lease owner 3977 * has been changed and logged, caller should call logSync(). 
3978 */ 3979 boolean internalReleaseLease(Lease lease, String src, INodesInPath iip, 3980 String recoveryLeaseHolder) throws IOException { 3981 LOG.info("Recovering " + lease + ", src=" + src); 3982 assert !isInSafeMode(); 3983 assert hasWriteLock(); 3984 3985 final INodeFile pendingFile = iip.getLastINode().asFile(); 3986 int nrBlocks = pendingFile.numBlocks(); 3987 BlockInfoContiguous[] blocks = pendingFile.getBlocks(); 3988 3989 int nrCompleteBlocks; 3990 BlockInfoContiguous curBlock = null; 3991 for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) { 3992 curBlock = blocks[nrCompleteBlocks]; 3993 if(!curBlock.isComplete()) 3994 break; 3995 assert blockManager.checkMinReplication(curBlock) : 3996 "A COMPLETE block is not minimally replicated in " + src; 3997 } 3998 3999 // If there are no incomplete blocks associated with this file, 4000 // then reap lease immediately and close the file. 4001 if(nrCompleteBlocks == nrBlocks) { 4002 finalizeINodeFileUnderConstruction(src, pendingFile, 4003 iip.getLatestSnapshotId()); 4004 NameNode.stateChangeLog.warn("BLOCK*" 4005 + " internalReleaseLease: All existing blocks are COMPLETE," 4006 + " lease removed, file closed."); 4007 return true; // closed! 4008 } 4009 4010 // Only the last and the penultimate blocks may be in non COMPLETE state. 4011 // If the penultimate block is not COMPLETE, then it must be COMMITTED. 
4012 if(nrCompleteBlocks < nrBlocks - 2 || 4013 nrCompleteBlocks == nrBlocks - 2 && 4014 curBlock != null && 4015 curBlock.getBlockUCState() != BlockUCState.COMMITTED) { 4016 final String message = "DIR* NameSystem.internalReleaseLease: " 4017 + "attempt to release a create lock on " 4018 + src + " but file is already closed."; 4019 NameNode.stateChangeLog.warn(message); 4020 throw new IOException(message); 4021 } 4022 4023 // The last block is not COMPLETE, and 4024 // that the penultimate block if exists is either COMPLETE or COMMITTED 4025 final BlockInfoContiguous lastBlock = pendingFile.getLastBlock(); 4026 BlockUCState lastBlockState = lastBlock.getBlockUCState(); 4027 BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock(); 4028 4029 // If penultimate block doesn't exist then its minReplication is met 4030 boolean penultimateBlockMinReplication = penultimateBlock == null ? true : 4031 blockManager.checkMinReplication(penultimateBlock); 4032 4033 switch(lastBlockState) { 4034 case COMPLETE: 4035 assert false : "Already checked that the last block is incomplete"; 4036 break; 4037 case COMMITTED: 4038 // Close file if committed blocks are minimally replicated 4039 if(penultimateBlockMinReplication && 4040 blockManager.checkMinReplication(lastBlock)) { 4041 finalizeINodeFileUnderConstruction(src, pendingFile, 4042 iip.getLatestSnapshotId()); 4043 NameNode.stateChangeLog.warn("BLOCK*" 4044 + " internalReleaseLease: Committed blocks are minimally replicated," 4045 + " lease removed, file closed."); 4046 return true; // closed! 4047 } 4048 // Cannot close file right now, since some blocks 4049 // are not yet minimally replicated. 4050 // This may potentially cause infinite loop in lease recovery 4051 // if there are no valid replicas on data-nodes. 4052 String message = "DIR* NameSystem.internalReleaseLease: " + 4053 "Failed to release lease for file " + src + 4054 ". Committed blocks are waiting to be minimally replicated." 
+ 4055 " Try again later."; 4056 NameNode.stateChangeLog.warn(message); 4057 throw new AlreadyBeingCreatedException(message); 4058 case UNDER_CONSTRUCTION: 4059 case UNDER_RECOVERY: 4060 final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock; 4061 // determine if last block was intended to be truncated 4062 Block recoveryBlock = uc.getTruncateBlock(); 4063 boolean truncateRecovery = recoveryBlock != null; 4064 boolean copyOnTruncate = truncateRecovery && 4065 recoveryBlock.getBlockId() != uc.getBlockId(); 4066 assert !copyOnTruncate || 4067 recoveryBlock.getBlockId() < uc.getBlockId() && 4068 recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() && 4069 recoveryBlock.getNumBytes() > uc.getNumBytes() : 4070 "wrong recoveryBlock"; 4071 4072 // setup the last block locations from the blockManager if not known 4073 if (uc.getNumExpectedLocations() == 0) { 4074 uc.setExpectedLocations(blockManager.getStorages(lastBlock)); 4075 } 4076 4077 if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) { 4078 // There is no datanode reported to this block. 4079 // may be client have crashed before writing data to pipeline. 4080 // This blocks doesn't need any recovery. 4081 // We can remove this block and close the file. 
4082 pendingFile.removeLastBlock(lastBlock); 4083 finalizeINodeFileUnderConstruction(src, pendingFile, 4084 iip.getLatestSnapshotId()); 4085 NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: " 4086 + "Removed empty last block and closed file."); 4087 return true; 4088 } 4089 // start recovery of the last block for this file 4090 long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc)); 4091 lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile); 4092 if(copyOnTruncate) { 4093 uc.setGenerationStamp(blockRecoveryId); 4094 } else if(truncateRecovery) { 4095 recoveryBlock.setGenerationStamp(blockRecoveryId); 4096 } 4097 uc.initializeBlockRecovery(blockRecoveryId); 4098 leaseManager.renewLease(lease); 4099 // Cannot close file right now, since the last block requires recovery. 4100 // This may potentially cause infinite loop in lease recovery 4101 // if there are no valid replicas on data-nodes. 4102 NameNode.stateChangeLog.warn( 4103 "DIR* NameSystem.internalReleaseLease: " + 4104 "File " + src + " has not been closed." + 4105 " Lease recovery is in progress. " + 4106 "RecoveryId = " + blockRecoveryId + " for block " + lastBlock); 4107 break; 4108 } 4109 return false; 4110 } 4111 4112 private Lease reassignLease(Lease lease, String src, String newHolder, 4113 INodeFile pendingFile) { 4114 assert hasWriteLock(); 4115 if(newHolder == null) 4116 return lease; 4117 // The following transaction is not synced. Make sure it's sync'ed later. 
4118 logReassignLease(lease.getHolder(), src, newHolder); 4119 return reassignLeaseInternal(lease, src, newHolder, pendingFile); 4120 } 4121 4122 Lease reassignLeaseInternal(Lease lease, String src, String newHolder, 4123 INodeFile pendingFile) { 4124 assert hasWriteLock(); 4125 pendingFile.getFileUnderConstructionFeature().setClientName(newHolder); 4126 return leaseManager.reassignLease(lease, src, newHolder); 4127 } 4128 4129 private void commitOrCompleteLastBlock(final INodeFile fileINode, 4130 final INodesInPath iip, final Block commitBlock) throws IOException { 4131 assert hasWriteLock(); 4132 Preconditions.checkArgument(fileINode.isUnderConstruction()); 4133 blockManager.commitOrCompleteLastBlock(fileINode, commitBlock, iip); 4134 } 4135 4136 private void finalizeINodeFileUnderConstruction(String src, 4137 INodeFile pendingFile, int latestSnapshot) throws IOException { 4138 assert hasWriteLock(); 4139 4140 FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature(); 4141 if (uc == null) { 4142 throw new IOException("Cannot finalize file " + src 4143 + " because it is not under construction"); 4144 } 4145 4146 pendingFile.recordModification(latestSnapshot); 4147 4148 // The file is no longer pending. 4149 // Create permanent INode, update blocks. 
No need to replace the inode here 4150 // since we just remove the uc feature from pendingFile 4151 pendingFile.toCompleteFile(now()); 4152 4153 leaseManager.removeLease(uc.getClientName(), src); 4154 4155 waitForLoadingFSImage(); 4156 // close file and persist block allocations for this file 4157 closeFile(src, pendingFile); 4158 4159 blockManager.checkReplication(pendingFile); 4160 } 4161 4162 @VisibleForTesting 4163 BlockInfoContiguous getStoredBlock(Block block) { 4164 return blockManager.getStoredBlock(block); 4165 } 4166 4167 @Override 4168 public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) { 4169 assert hasReadLock(); 4170 final BlockCollection bc = blockUC.getBlockCollection(); 4171 if (bc == null || !(bc instanceof INodeFile) 4172 || !bc.isUnderConstruction()) { 4173 return false; 4174 } 4175 4176 String fullName = bc.getName(); 4177 try { 4178 if (fullName != null && fullName.startsWith(Path.SEPARATOR) 4179 && dir.getINode(fullName) == bc) { 4180 // If file exists in normal path then no need to look in snapshot 4181 return false; 4182 } 4183 } catch (UnresolvedLinkException e) { 4184 LOG.error("Error while resolving the link : " + fullName, e); 4185 return false; 4186 } 4187 /* 4188 * 1. if bc is under construction and also with snapshot, and 4189 * bc is not in the current fsdirectory tree, bc must represent a snapshot 4190 * file. 4191 * 2. if fullName is not an absolute path, bc cannot be existent in the 4192 * current fsdirectory tree. 4193 * 3. if bc is not the current node associated with fullName, bc must be a 4194 * snapshot inode. 
4195 */ 4196 return true; 4197 } 4198 4199 void commitBlockSynchronization(ExtendedBlock oldBlock, 4200 long newgenerationstamp, long newlength, 4201 boolean closeFile, boolean deleteblock, DatanodeID[] newtargets, 4202 String[] newtargetstorages) throws IOException { 4203 LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock 4204 + ", newgenerationstamp=" + newgenerationstamp 4205 + ", newlength=" + newlength 4206 + ", newtargets=" + Arrays.asList(newtargets) 4207 + ", closeFile=" + closeFile 4208 + ", deleteBlock=" + deleteblock 4209 + ")"); 4210 checkOperation(OperationCategory.WRITE); 4211 final String src; 4212 waitForLoadingFSImage(); 4213 writeLock(); 4214 try { 4215 checkOperation(OperationCategory.WRITE); 4216 // If a DN tries to commit to the standby, the recovery will 4217 // fail, and the next retry will succeed on the new NN. 4218 4219 checkNameNodeSafeMode( 4220 "Cannot commitBlockSynchronization while in safe mode"); 4221 final BlockInfoContiguous storedBlock = getStoredBlock( 4222 ExtendedBlock.getLocalBlock(oldBlock)); 4223 if (storedBlock == null) { 4224 if (deleteblock) { 4225 // This may be a retry attempt so ignore the failure 4226 // to locate the block. 4227 if (LOG.isDebugEnabled()) { 4228 LOG.debug("Block (=" + oldBlock + ") not found"); 4229 } 4230 return; 4231 } else { 4232 throw new IOException("Block (=" + oldBlock + ") not found"); 4233 } 4234 } 4235 final long oldGenerationStamp = storedBlock.getGenerationStamp(); 4236 final long oldNumBytes = storedBlock.getNumBytes(); 4237 // 4238 // The implementation of delete operation (see @deleteInternal method) 4239 // first removes the file paths from namespace, and delays the removal 4240 // of blocks to later time for better performance. 
When 4241 // commitBlockSynchronization (this method) is called in between, the 4242 // blockCollection of storedBlock could have been assigned to null by 4243 // the delete operation, throw IOException here instead of NPE; if the 4244 // file path is already removed from namespace by the delete operation, 4245 // throw FileNotFoundException here, so not to proceed to the end of 4246 // this method to add a CloseOp to the edit log for an already deleted 4247 // file (See HDFS-6825). 4248 // 4249 BlockCollection blockCollection = storedBlock.getBlockCollection(); 4250 if (blockCollection == null) { 4251 throw new IOException("The blockCollection of " + storedBlock 4252 + " is null, likely because the file owning this block was" 4253 + " deleted and the block removal is delayed"); 4254 } 4255 INodeFile iFile = ((INode)blockCollection).asFile(); 4256 src = iFile.getFullPathName(); 4257 if (isFileDeleted(iFile)) { 4258 throw new FileNotFoundException("File not found: " 4259 + src + ", likely due to delayed block removal"); 4260 } 4261 if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) && 4262 iFile.getLastBlock().isComplete()) { 4263 if (LOG.isDebugEnabled()) { 4264 LOG.debug("Unexpected block (=" + oldBlock 4265 + ") since the file (=" + iFile.getLocalName() 4266 + ") is not under construction"); 4267 } 4268 return; 4269 } 4270 4271 BlockInfoContiguousUnderConstruction truncatedBlock = 4272 (BlockInfoContiguousUnderConstruction) iFile.getLastBlock(); 4273 long recoveryId = truncatedBlock.getBlockRecoveryId(); 4274 boolean copyTruncate = 4275 truncatedBlock.getBlockId() != storedBlock.getBlockId(); 4276 if(recoveryId != newgenerationstamp) { 4277 throw new IOException("The recovery id " + newgenerationstamp 4278 + " does not match current recovery id " 4279 + recoveryId + " for block " + oldBlock); 4280 } 4281 4282 if (deleteblock) { 4283 Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock); 4284 boolean remove = iFile.removeLastBlock(blockToDel); 4285 
if (remove) { 4286 blockManager.removeBlock(storedBlock); 4287 } 4288 } 4289 else { 4290 // update last block 4291 if(!copyTruncate) { 4292 storedBlock.setGenerationStamp(newgenerationstamp); 4293 storedBlock.setNumBytes(newlength); 4294 } 4295 4296 // find the DatanodeDescriptor objects 4297 ArrayList<DatanodeDescriptor> trimmedTargets = 4298 new ArrayList<DatanodeDescriptor>(newtargets.length); 4299 ArrayList<String> trimmedStorages = 4300 new ArrayList<String>(newtargets.length); 4301 if (newtargets.length > 0) { 4302 for (int i = 0; i < newtargets.length; ++i) { 4303 // try to get targetNode 4304 DatanodeDescriptor targetNode = 4305 blockManager.getDatanodeManager().getDatanode(newtargets[i]); 4306 if (targetNode != null) { 4307 trimmedTargets.add(targetNode); 4308 trimmedStorages.add(newtargetstorages[i]); 4309 } else if (LOG.isDebugEnabled()) { 4310 LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found"); 4311 } 4312 } 4313 } 4314 if ((closeFile) && !trimmedTargets.isEmpty()) { 4315 // the file is getting closed. Insert block locations into blockManager. 4316 // Otherwise fsck will report these blocks as MISSING, especially if the 4317 // blocksReceived from Datanodes take a long time to arrive. 
4318 for (int i = 0; i < trimmedTargets.size(); i++) { 4319 DatanodeStorageInfo storageInfo = 4320 trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i)); 4321 if (storageInfo != null) { 4322 if(copyTruncate) { 4323 storageInfo.addBlock(truncatedBlock); 4324 } else { 4325 storageInfo.addBlock(storedBlock); 4326 } 4327 } 4328 } 4329 } 4330 4331 // add pipeline locations into the INodeUnderConstruction 4332 DatanodeStorageInfo[] trimmedStorageInfos = 4333 blockManager.getDatanodeManager().getDatanodeStorageInfos( 4334 trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]), 4335 trimmedStorages.toArray(new String[trimmedStorages.size()]), 4336 "src=%s, oldBlock=%s, newgenerationstamp=%d, newlength=%d", 4337 src, oldBlock, newgenerationstamp, newlength); 4338 4339 if(copyTruncate) { 4340 iFile.setLastBlock(truncatedBlock, trimmedStorageInfos); 4341 } else { 4342 iFile.setLastBlock(storedBlock, trimmedStorageInfos); 4343 if (closeFile) { 4344 blockManager.markBlockReplicasAsCorrupt(storedBlock, 4345 oldGenerationStamp, oldNumBytes, trimmedStorageInfos); 4346 } 4347 } 4348 } 4349 4350 if (closeFile) { 4351 if(copyTruncate) { 4352 closeFileCommitBlocks(src, iFile, truncatedBlock); 4353 if(!iFile.isBlockInLatestSnapshot(storedBlock)) { 4354 blockManager.removeBlock(storedBlock); 4355 } 4356 } else { 4357 closeFileCommitBlocks(src, iFile, storedBlock); 4358 } 4359 } else { 4360 // If this commit does not want to close the file, persist blocks 4361 persistBlocks(src, iFile, false); 4362 } 4363 } finally { 4364 writeUnlock("commitBlockSynchronization"); 4365 } 4366 getEditLog().logSync(); 4367 if (closeFile) { 4368 LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock 4369 + ", file=" + src 4370 + ", newgenerationstamp=" + newgenerationstamp 4371 + ", newlength=" + newlength 4372 + ", newtargets=" + Arrays.asList(newtargets) + ") successful"); 4373 } else { 4374 LOG.info("commitBlockSynchronization(" + oldBlock + ") successful"); 4375 } 4376 } 4377 4378 /** 
4379 * @param pendingFile open file that needs to be closed 4380 * @param storedBlock last block 4381 * @throws IOException on error 4382 */ 4383 @VisibleForTesting 4384 void closeFileCommitBlocks(String src, INodeFile pendingFile, 4385 BlockInfoContiguous storedBlock) throws IOException { 4386 final INodesInPath iip = INodesInPath.fromINode(pendingFile); 4387 4388 // commit the last block and complete it if it has minimum replicas 4389 commitOrCompleteLastBlock(pendingFile, iip, storedBlock); 4390 4391 //remove lease, close file 4392 finalizeINodeFileUnderConstruction(src, pendingFile, 4393 Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID)); 4394 } 4395 4396 /** 4397 * Renew the lease(s) held by the given client 4398 */ 4399 void renewLease(String holder) throws IOException { 4400 checkOperation(OperationCategory.WRITE); 4401 readLock(); 4402 try { 4403 checkOperation(OperationCategory.WRITE); 4404 checkNameNodeSafeMode("Cannot renew lease for " + holder); 4405 leaseManager.renewLease(holder); 4406 } finally { 4407 readUnlock("renewLease"); 4408 } 4409 } 4410 4411 /** 4412 * Get a partial listing of the indicated directory 4413 * 4414 * @param src the directory name 4415 * @param startAfter the name to start after 4416 * @param needLocation if blockLocations need to be returned 4417 * @return a partial listing starting after startAfter 4418 * 4419 * @throws AccessControlException if access is denied 4420 * @throws UnresolvedLinkException if symbolic link is encountered 4421 * @throws IOException if other I/O error occurred 4422 */ 4423 DirectoryListing getListing(String src, byte[] startAfter, 4424 boolean needLocation) 4425 throws IOException { 4426 checkOperation(OperationCategory.READ); 4427 final String operationName = "listStatus"; 4428 DirectoryListing dl = null; 4429 readLock(); 4430 try { 4431 checkOperation(NameNode.OperationCategory.READ); 4432 dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter, 4433 needLocation); 4434 } 
catch (AccessControlException e) { 4435 logAuditEvent(false, operationName, src); 4436 throw e; 4437 } finally { 4438 readUnlock(operationName); 4439 } 4440 logAuditEvent(true, operationName, src); 4441 return dl; 4442 } 4443 4444 ///////////////////////////////////////////////////////// 4445 // 4446 // These methods are called by datanodes 4447 // 4448 ///////////////////////////////////////////////////////// 4449 /** 4450 * Register Datanode. 4451 * <p> 4452 * The purpose of registration is to identify whether the new datanode 4453 * serves a new data storage, and will report new data block copies, 4454 * which the namenode was not aware of; or the datanode is a replacement 4455 * node for the data storage that was previously served by a different 4456 * or the same (in terms of host:port) datanode. 4457 * The data storages are distinguished by their storageIDs. When a new 4458 * data storage is reported the namenode issues a new unique storageID. 4459 * <p> 4460 * Finally, the namenode returns its namespaceID as the registrationID 4461 * for the datanodes. 4462 * namespaceID is a persistent attribute of the name space. 4463 * The registrationID is checked every time the datanode is communicating 4464 * with the namenode. 4465 * Datanodes with inappropriate registrationID are rejected. 4466 * If the namenode stops, and then restarts it can restore its 4467 * namespaceID and will continue serving the datanodes that has previously 4468 * registered with the namenode without restarting the whole cluster. 4469 * 4470 * @see org.apache.hadoop.hdfs.server.datanode.DataNode 4471 */ 4472 void registerDatanode(DatanodeRegistration nodeReg) throws IOException { 4473 writeLock(); 4474 try { 4475 getBlockManager().getDatanodeManager().registerDatanode(nodeReg); 4476 checkSafeMode(); 4477 } finally { 4478 writeUnlock("registerDatanode"); 4479 } 4480 } 4481 4482 /** 4483 * Get registrationID for datanodes based on the namespaceID. 
4484 * 4485 * @see #registerDatanode(DatanodeRegistration) 4486 * @return registration ID 4487 */ 4488 String getRegistrationID() { 4489 return Storage.getRegistrationID(getFSImage().getStorage()); 4490 } 4491 4492 /** 4493 * The given node has reported in. This method should: 4494 * 1) Record the heartbeat, so the datanode isn't timed out 4495 * 2) Adjust usage stats for future block allocation 4496 * 4497 * If a substantial amount of time passed since the last datanode 4498 * heartbeat then request an immediate block report. 4499 * 4500 * @return an array of datanode commands 4501 * @throws IOException 4502 */ 4503 HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg, 4504 StorageReport[] reports, long cacheCapacity, long cacheUsed, 4505 int xceiverCount, int xmitsInProgress, int failedVolumes, 4506 VolumeFailureSummary volumeFailureSummary) throws IOException { 4507 readLock(); 4508 try { 4509 //get datanode commands 4510 final int maxTransfer = blockManager.getMaxReplicationStreams() 4511 - xmitsInProgress; 4512 DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat( 4513 nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed, 4514 xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary); 4515 4516 //create ha status 4517 final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat( 4518 haContext.getState().getServiceState(), 4519 getFSImage().getCorrectLastAppliedOrWrittenTxId()); 4520 4521 return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo); 4522 } finally { 4523 readUnlock("handleHeartbeat"); 4524 } 4525 } 4526 4527 /** 4528 * Returns whether or not there were available resources at the last check of 4529 * resources. 4530 * 4531 * @return true if there were sufficient resources available, false otherwise. 4532 */ 4533 boolean nameNodeHasResourcesAvailable() { 4534 return hasResourcesAvailable; 4535 } 4536 4537 /** 4538 * Perform resource checks and cache the results. 
4539 */ 4540 void checkAvailableResources() { 4541 Preconditions.checkState(nnResourceChecker != null, 4542 "nnResourceChecker not initialized"); 4543 hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace(); 4544 } 4545 4546 /** 4547 * Persist the block list for the inode. 4548 * @param path 4549 * @param file 4550 * @param logRetryCache 4551 */ 4552 private void persistBlocks(String path, INodeFile file, 4553 boolean logRetryCache) { 4554 assert hasWriteLock(); 4555 Preconditions.checkArgument(file.isUnderConstruction()); 4556 getEditLog().logUpdateBlocks(path, file, logRetryCache); 4557 NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" + 4558 " peristed to the file system", path, file.getBlocks().length); 4559 } 4560 4561 /** 4562 * Close file. 4563 * @param path 4564 * @param file 4565 */ 4566 private void closeFile(String path, INodeFile file) { 4567 assert hasWriteLock(); 4568 waitForLoadingFSImage(); 4569 // file is closed 4570 getEditLog().logCloseFile(path, file); 4571 NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" + 4572 " to the file system", path, file.getBlocks().length); 4573 } 4574 4575 /** 4576 * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if 4577 * there are found to be insufficient resources available, causes the NN to 4578 * enter safe mode. If resources are later found to have returned to 4579 * acceptable levels, this daemon will cause the NN to exit safe mode. 4580 */ 4581 class NameNodeResourceMonitor implements Runnable { 4582 boolean shouldNNRmRun = true; 4583 @Override 4584 public void run () { 4585 try { 4586 while (fsRunning && shouldNNRmRun) { 4587 checkAvailableResources(); 4588 if(!nameNodeHasResourcesAvailable()) { 4589 String lowResourcesMsg = "NameNode low on available disk space. 
"; 4590 if (!isInSafeMode()) { 4591 LOG.warn(lowResourcesMsg + "Entering safe mode."); 4592 } else { 4593 LOG.warn(lowResourcesMsg + "Already in safe mode."); 4594 } 4595 enterSafeMode(true); 4596 } 4597 try { 4598 Thread.sleep(resourceRecheckInterval); 4599 } catch (InterruptedException ie) { 4600 // Deliberately ignore 4601 } 4602 } 4603 } catch (Exception e) { 4604 FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e); 4605 } 4606 } 4607 4608 public void stopMonitor() { 4609 shouldNNRmRun = false; 4610 } 4611 } 4612 4613 class NameNodeEditLogRoller implements Runnable { 4614 4615 private boolean shouldRun = true; 4616 private final long rollThreshold; 4617 private final long sleepIntervalMs; 4618 4619 public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) { 4620 this.rollThreshold = rollThreshold; 4621 this.sleepIntervalMs = sleepIntervalMs; 4622 } 4623 4624 @Override 4625 public void run() { 4626 while (fsRunning && shouldRun) { 4627 try { 4628 FSEditLog editLog = getFSImage().getEditLog(); 4629 long numEdits = 4630 editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId(); 4631 if (numEdits > rollThreshold) { 4632 FSNamesystem.LOG.info("NameNode rolling its own edit log because" 4633 + " number of edits in open segment exceeds threshold of " 4634 + rollThreshold); 4635 rollEditLog(); 4636 } 4637 } catch (Exception e) { 4638 FSNamesystem.LOG.error("Swallowing exception in " 4639 + NameNodeEditLogRoller.class.getSimpleName() + ":", e); 4640 } 4641 try { 4642 Thread.sleep(sleepIntervalMs); 4643 } catch (InterruptedException e) { 4644 FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName() 4645 + " was interrupted, exiting"); 4646 break; 4647 } 4648 } 4649 } 4650 4651 public void stop() { 4652 shouldRun = false; 4653 } 4654 } 4655 4656 /** 4657 * Daemon to periodically scan the namespace for lazyPersist files 4658 * with missing blocks and unlink them. 
4659 */ 4660 class LazyPersistFileScrubber implements Runnable { 4661 private volatile boolean shouldRun = true; 4662 final int scrubIntervalSec; 4663 public LazyPersistFileScrubber(final int scrubIntervalSec) { 4664 this.scrubIntervalSec = scrubIntervalSec; 4665 } 4666 4667 /** 4668 * Periodically go over the list of lazyPersist files with missing 4669 * blocks and unlink them from the namespace. 4670 */ 4671 private void clearCorruptLazyPersistFiles() 4672 throws IOException { 4673 4674 BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST"); 4675 4676 List<BlockCollection> filesToDelete = new ArrayList<>(); 4677 boolean changed = false; 4678 writeLock(); 4679 try { 4680 final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator(); 4681 4682 while (it.hasNext()) { 4683 Block b = it.next(); 4684 BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b); 4685 if (blockInfo == null) { 4686 LOG.info("Cannot find block info for block " + b); 4687 } else { 4688 if (blockInfo.getBlockCollection().getStoragePolicyID() 4689 == lpPolicy.getId()) { 4690 filesToDelete.add(blockInfo.getBlockCollection()); 4691 } 4692 } 4693 } 4694 4695 for (BlockCollection bc : filesToDelete) { 4696 LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas."); 4697 BlocksMapUpdateInfo toRemoveBlocks = 4698 FSDirDeleteOp.deleteInternal( 4699 FSNamesystem.this, bc.getName(), 4700 INodesInPath.fromINode((INodeFile) bc), false); 4701 changed |= toRemoveBlocks != null; 4702 if (toRemoveBlocks != null) { 4703 removeBlocks(toRemoveBlocks); // Incremental deletion of blocks 4704 } 4705 } 4706 } finally { 4707 writeUnlock("clearCorruptLazyPersistFiles"); 4708 } 4709 if (changed) { 4710 getEditLog().logSync(); 4711 } 4712 } 4713 4714 @Override 4715 public void run() { 4716 while (fsRunning && shouldRun) { 4717 try { 4718 clearCorruptLazyPersistFiles(); 4719 } catch (Exception e) { 4720 FSNamesystem.LOG.error( 4721 "Ignoring exception in 
LazyPersistFileScrubber:", e); 4722 } 4723 4724 try { 4725 Thread.sleep(scrubIntervalSec * 1000); 4726 } catch (InterruptedException e) { 4727 FSNamesystem.LOG.info( 4728 "LazyPersistFileScrubber was interrupted, exiting"); 4729 break; 4730 } 4731 } 4732 } 4733 4734 public void stop() { 4735 shouldRun = false; 4736 } 4737 } 4738 4739 public FSImage getFSImage() { 4740 return fsImage; 4741 } 4742 4743 public FSEditLog getEditLog() { 4744 return getFSImage().getEditLog(); 4745 } 4746 4747 private void checkBlock(ExtendedBlock block) throws IOException { 4748 if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) { 4749 throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId() 4750 + " - expected " + blockPoolId); 4751 } 4752 } 4753 4754 @Metric({"MissingBlocks", "Number of missing blocks"}) 4755 public long getMissingBlocksCount() { 4756 // not locking 4757 return blockManager.getMissingBlocksCount(); 4758 } 4759 4760 @Metric({"MissingReplOneBlocks", "Number of missing blocks " + 4761 "with replication factor 1"}) 4762 public long getMissingReplOneBlocksCount() { 4763 // not locking 4764 return blockManager.getMissingReplOneBlocksCount(); 4765 } 4766 4767 @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"}) 4768 public int getExpiredHeartbeats() { 4769 return datanodeStatistics.getExpiredHeartbeats(); 4770 } 4771 4772 @Metric({"TransactionsSinceLastCheckpoint", 4773 "Number of transactions since last checkpoint"}) 4774 public long getTransactionsSinceLastCheckpoint() { 4775 return getEditLog().getLastWrittenTxIdWithoutLock() - 4776 getFSImage().getStorage().getMostRecentCheckpointTxId(); 4777 } 4778 4779 @Metric({"TransactionsSinceLastLogRoll", 4780 "Number of transactions since last edit log roll"}) 4781 public long getTransactionsSinceLastLogRoll() { 4782 if (isInStandbyState() || !getEditLog().isSegmentOpenWithoutLock()) { 4783 return 0; 4784 } else { 4785 return getEditLog().getLastWrittenTxIdWithoutLock() - 4786 
getEditLog().getCurSegmentTxIdWithoutLock() + 1; 4787 } 4788 } 4789 4790 @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"}) 4791 public long getLastWrittenTransactionId() { 4792 return getEditLog().getLastWrittenTxIdWithoutLock(); 4793 } 4794 4795 @Metric({"LastCheckpointTime", 4796 "Time in milliseconds since the epoch of the last checkpoint"}) 4797 public long getLastCheckpointTime() { 4798 return getFSImage().getStorage().getMostRecentCheckpointTime(); 4799 } 4800 4801 /** @see ClientProtocol#getStats() */ 4802 long[] getStats() { 4803 final long[] stats = datanodeStatistics.getStats(); 4804 stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks(); 4805 stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks(); 4806 stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount(); 4807 stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] = 4808 getMissingReplOneBlocksCount(); 4809 return stats; 4810 } 4811 4812 @Override // FSNamesystemMBean 4813 @Metric({"CapacityTotal", 4814 "Total raw capacity of data nodes in bytes"}) 4815 public long getCapacityTotal() { 4816 return datanodeStatistics.getCapacityTotal(); 4817 } 4818 4819 @Metric({"CapacityTotalGB", 4820 "Total raw capacity of data nodes in GB"}) 4821 public float getCapacityTotalGB() { 4822 return DFSUtil.roundBytesToGB(getCapacityTotal()); 4823 } 4824 4825 @Override // FSNamesystemMBean 4826 @Metric({"CapacityUsed", 4827 "Total used capacity across all data nodes in bytes"}) 4828 public long getCapacityUsed() { 4829 return datanodeStatistics.getCapacityUsed(); 4830 } 4831 4832 @Metric({"CapacityUsedGB", 4833 "Total used capacity across all data nodes in GB"}) 4834 public float getCapacityUsedGB() { 4835 return DFSUtil.roundBytesToGB(getCapacityUsed()); 4836 } 4837 4838 @Override // FSNamesystemMBean 4839 @Metric({"CapacityRemaining", "Remaining capacity in bytes"}) 4840 public long getCapacityRemaining() { 4841 
return datanodeStatistics.getCapacityRemaining(); 4842 } 4843 4844 @Metric({"CapacityRemainingGB", "Remaining capacity in GB"}) 4845 public float getCapacityRemainingGB() { 4846 return DFSUtil.roundBytesToGB(getCapacityRemaining()); 4847 } 4848 4849 @Metric({"CapacityUsedNonDFS", 4850 "Total space used by data nodes for non DFS purposes in bytes"}) 4851 public long getCapacityUsedNonDFS() { 4852 return datanodeStatistics.getCapacityUsedNonDFS(); 4853 } 4854 4855 /** 4856 * Total number of connections. 4857 */ 4858 @Override // FSNamesystemMBean 4859 @Metric 4860 public int getTotalLoad() { 4861 return datanodeStatistics.getXceiverCount(); 4862 } 4863 4864 @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" }) 4865 public int getNumSnapshottableDirs() { 4866 return this.snapshotManager.getNumSnapshottableDirs(); 4867 } 4868 4869 @Metric({ "Snapshots", "The number of snapshots" }) 4870 public int getNumSnapshots() { 4871 return this.snapshotManager.getNumSnapshots(); 4872 } 4873 4874 @Override 4875 public String getSnapshotStats() { 4876 Map<String, Object> info = new HashMap<String, Object>(); 4877 info.put("SnapshottableDirectories", this.getNumSnapshottableDirs()); 4878 info.put("Snapshots", this.getNumSnapshots()); 4879 return JSON.toString(info); 4880 } 4881 4882 @Override // FSNamesystemMBean 4883 @Metric({ "NumEncryptionZones", "The number of encryption zones" }) 4884 public int getNumEncryptionZones() { 4885 return dir.ezManager.getNumEncryptionZones(); 4886 } 4887 4888 /** 4889 * Returns the length of the wait Queue for the FSNameSystemLock. 4890 * 4891 * A larger number here indicates lots of threads are waiting for 4892 * FSNameSystemLock. 
4893 * 4894 * @return int - Number of Threads waiting to acquire FSNameSystemLock 4895 */ 4896 @Override 4897 @Metric({"LockQueueLength", "Number of threads waiting to " + 4898 "acquire FSNameSystemLock"}) 4899 public int getFsLockQueueLength() { 4900 return fsLock.getQueueLength(); 4901 } 4902 4903 int getNumberOfDatanodes(DatanodeReportType type) { 4904 readLock(); 4905 try { 4906 return getBlockManager().getDatanodeManager().getDatanodeListForReport( 4907 type).size(); 4908 } finally { 4909 readUnlock("getNumberOfDatanodes"); 4910 } 4911 } 4912 4913 DatanodeInfo[] datanodeReport(final DatanodeReportType type 4914 ) throws AccessControlException, StandbyException { 4915 checkSuperuserPrivilege(); 4916 checkOperation(OperationCategory.UNCHECKED); 4917 readLock(); 4918 try { 4919 checkOperation(OperationCategory.UNCHECKED); 4920 final DatanodeManager dm = getBlockManager().getDatanodeManager(); 4921 final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type); 4922 4923 DatanodeInfo[] arr = new DatanodeInfo[results.size()]; 4924 for (int i=0; i<arr.length; i++) { 4925 arr[i] = new DatanodeInfo(results.get(i)); 4926 } 4927 return arr; 4928 } finally { 4929 readUnlock("datanodeReport"); 4930 } 4931 } 4932 4933 DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type 4934 ) throws AccessControlException, StandbyException { 4935 checkSuperuserPrivilege(); 4936 checkOperation(OperationCategory.UNCHECKED); 4937 readLock(); 4938 try { 4939 checkOperation(OperationCategory.UNCHECKED); 4940 final DatanodeManager dm = getBlockManager().getDatanodeManager(); 4941 final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type); 4942 4943 DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()]; 4944 for (int i = 0; i < reports.length; i++) { 4945 final DatanodeDescriptor d = datanodes.get(i); 4946 reports[i] = new DatanodeStorageReport(new DatanodeInfo(d), 4947 d.getStorageReports()); 4948 } 4949 return 
reports; 4950 } finally { 4951 readUnlock("getDatanodeStorageReport"); 4952 } 4953 } 4954 4955 /** 4956 * Save namespace image. 4957 * This will save current namespace into fsimage file and empty edits file. 4958 * Requires superuser privilege and safe mode. 4959 * 4960 * @throws AccessControlException if superuser privilege is violated. 4961 * @throws IOException if 4962 */ 4963 void saveNamespace() throws AccessControlException, IOException { 4964 checkOperation(OperationCategory.UNCHECKED); 4965 checkSuperuserPrivilege(); 4966 4967 cpLock(); // Block if a checkpointing is in progress on standby. 4968 readLock(); 4969 try { 4970 checkOperation(OperationCategory.UNCHECKED); 4971 4972 if (!isInSafeMode()) { 4973 throw new IOException("Safe mode should be turned ON " 4974 + "in order to create namespace image."); 4975 } 4976 getFSImage().saveNamespace(this); 4977 } finally { 4978 readUnlock("saveNamespace"); 4979 cpUnlock(); 4980 } 4981 LOG.info("New namespace image has been created"); 4982 } 4983 4984 /** 4985 * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again. 4986 * Requires superuser privilege. 4987 * 4988 * @throws AccessControlException if superuser privilege is violated. 4989 */ 4990 boolean restoreFailedStorage(String arg) throws AccessControlException, 4991 StandbyException { 4992 checkSuperuserPrivilege(); 4993 checkOperation(OperationCategory.UNCHECKED); 4994 cpLock(); // Block if a checkpointing is in progress on standby. 4995 writeLock(); 4996 try { 4997 checkOperation(OperationCategory.UNCHECKED); 4998 4999 // if it is disabled - enable it and vice versa. 
5000 if(arg.equals("check")) 5001 return getFSImage().getStorage().getRestoreFailedStorage(); 5002 5003 boolean val = arg.equals("true"); // false if not 5004 getFSImage().getStorage().setRestoreFailedStorage(val); 5005 5006 return val; 5007 } finally { 5008 writeUnlock("restoreFailedStorage"); 5009 cpUnlock(); 5010 } 5011 } 5012 5013 Date getStartTime() { 5014 return new Date(startTime); 5015 } 5016 5017 void finalizeUpgrade() throws IOException { 5018 checkSuperuserPrivilege(); 5019 checkOperation(OperationCategory.UNCHECKED); 5020 cpLock(); // Block if a checkpointing is in progress on standby. 5021 writeLock(); 5022 try { 5023 checkOperation(OperationCategory.UNCHECKED); 5024 getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState()); 5025 } finally { 5026 writeUnlock("finalizeUpgrade"); 5027 cpUnlock(); 5028 } 5029 } 5030 5031 void refreshNodes() throws IOException { 5032 checkOperation(OperationCategory.UNCHECKED); 5033 checkSuperuserPrivilege(); 5034 getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration()); 5035 } 5036 5037 void setBalancerBandwidth(long bandwidth) throws IOException { 5038 checkOperation(OperationCategory.UNCHECKED); 5039 checkSuperuserPrivilege(); 5040 getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth); 5041 } 5042 5043 /** 5044 * Persist the new block (the last block of the given file). 5045 * @param path 5046 * @param file 5047 */ 5048 private void persistNewBlock(String path, INodeFile file) { 5049 Preconditions.checkArgument(file.isUnderConstruction()); 5050 getEditLog().logAddBlock(path, file); 5051 NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," + 5052 " current total block count is {}", path, 5053 file.getLastBlock().toString(), file.getBlocks().length); 5054 } 5055 5056 /** 5057 * SafeModeInfo contains information related to the safe mode. 5058 * <p> 5059 * An instance of {@link SafeModeInfo} is created when the name node 5060 * enters safe mode. 
5061 * <p> 5062 * During name node startup {@link SafeModeInfo} counts the number of 5063 * <em>safe blocks</em>, those that have at least the minimal number of 5064 * replicas, and calculates the ratio of safe blocks to the total number 5065 * of blocks in the system, which is the size of blocks in 5066 * {@link FSNamesystem#blockManager}. When the ratio reaches the 5067 * {@link #threshold} it starts the SafeModeMonitor daemon in order 5068 * to monitor whether the safe mode {@link #extension} is passed. 5069 * Then it leaves safe mode and destroys itself. 5070 * <p> 5071 * If safe mode is turned on manually then the number of safe blocks is 5072 * not tracked because the name node is not intended to leave safe mode 5073 * automatically in the case. 5074 * 5075 * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean) 5076 */ 5077 public class SafeModeInfo { 5078 // configuration fields 5079 /** Safe mode threshold condition %.*/ 5080 private final double threshold; 5081 /** Safe mode minimum number of datanodes alive */ 5082 private final int datanodeThreshold; 5083 /** 5084 * Safe mode extension after the threshold. 5085 * Make it volatile so that getSafeModeTip can read the latest value 5086 * without taking a lock. 5087 */ 5088 private volatile int extension; 5089 /** Min replication required by safe mode. */ 5090 private final int safeReplication; 5091 /** threshold for populating needed replication queues */ 5092 private final double replQueueThreshold; 5093 // internal fields 5094 /** Time when threshold was reached. 5095 * <br> -1 safe mode is off 5096 * <br> 0 safe mode is on, and threshold is not reached yet 5097 * <br> >0 safe mode is on, but we are in extension period 5098 */ 5099 private long reached = -1; 5100 private long reachedTimestamp = -1; 5101 /** Total number of blocks. */ 5102 int blockTotal; 5103 /** Number of safe blocks. 
*/ 5104 int blockSafe; 5105 /** Number of blocks needed to satisfy safe mode threshold condition */ 5106 private int blockThreshold; 5107 /** Number of blocks needed before populating replication queues */ 5108 private int blockReplQueueThreshold; 5109 /** time of the last status printout */ 5110 private long lastStatusReport = 0; 5111 /** 5112 * Was safemode entered automatically because available resources were low. 5113 * Make it volatile so that getSafeModeTip can read the latest value 5114 * without taking a lock. 5115 */ 5116 private volatile boolean resourcesLow = false; 5117 /** Should safemode adjust its block totals as blocks come in */ 5118 private boolean shouldIncrementallyTrackBlocks = false; 5119 /** counter for tracking startup progress of reported blocks */ 5120 private Counter awaitingReportedBlocksCounter; 5121 5122 /** 5123 * Creates SafeModeInfo when the name node enters 5124 * automatic safe mode at startup. 5125 * 5126 * @param conf configuration 5127 */ 5128 private SafeModeInfo(Configuration conf) { 5129 this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY, 5130 DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT); 5131 if(threshold > 1.0) { 5132 LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold); 5133 } 5134 this.datanodeThreshold = conf.getInt( 5135 DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 5136 DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT); 5137 this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); 5138 this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 5139 DFS_NAMENODE_REPLICATION_MIN_DEFAULT); 5140 5141 LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold); 5142 LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold); 5143 LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + " = " + extension); 5144 5145 // default to safe mode threshold (i.e., don't populate queues before leaving safe mode) 5146 this.replQueueThreshold = 5147 
conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 5148 (float) threshold); 5149 this.blockTotal = 0; 5150 this.blockSafe = 0; 5151 } 5152 5153 /** 5154 * In the HA case, the StandbyNode can be in safemode while the namespace 5155 * is modified by the edit log tailer. In this case, the number of total 5156 * blocks changes as edits are processed (eg blocks are added and deleted). 5157 * However, we don't want to do the incremental tracking during the 5158 * startup-time loading process -- only once the initial total has been 5159 * set after the image has been loaded. 5160 */ 5161 private boolean shouldIncrementallyTrackBlocks() { 5162 return shouldIncrementallyTrackBlocks; 5163 } 5164 5165 /** 5166 * Creates SafeModeInfo when safe mode is entered manually, or because 5167 * available resources are low. 5168 * 5169 * The {@link #threshold} is set to 1.5 so that it could never be reached. 5170 * {@link #blockTotal} is set to -1 to indicate that safe mode is manual. 5171 * 5172 * @see SafeModeInfo 5173 */ 5174 private SafeModeInfo(boolean resourcesLow) { 5175 this.threshold = 1.5f; // this threshold can never be reached 5176 this.datanodeThreshold = Integer.MAX_VALUE; 5177 this.extension = Integer.MAX_VALUE; 5178 this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication 5179 this.replQueueThreshold = 1.5f; // can never be reached 5180 this.blockTotal = -1; 5181 this.blockSafe = -1; 5182 this.resourcesLow = resourcesLow; 5183 enter(); 5184 reportStatus("STATE* Safe mode is ON.", true); 5185 } 5186 5187 /** 5188 * Check if safe mode is on. 5189 * @return true if in safe mode 5190 */ 5191 private synchronized boolean isOn() { 5192 doConsistencyCheck(); 5193 return this.reached >= 0; 5194 } 5195 5196 /** 5197 * Enter safe mode. 5198 */ 5199 private void enter() { 5200 this.reached = 0; 5201 this.reachedTimestamp = 0; 5202 } 5203 5204 /** 5205 * Leave safe mode. 5206 * <p> 5207 * Check for invalid, under- & over-replicated blocks in the end of startup. 
*/
    private synchronized void leave() {
      // if not done yet, initialize replication queues.
      // In the standby, do not populate repl queues
      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
        initializeReplQueues();
      }
      // Time spent in safe mode is measured from startTime, in milliseconds.
      long timeInSafemode = now() - startTime;
      NameNode.stateChangeLog.info("STATE* Leaving safe mode after "
                                    + timeInSafemode/1000 + " secs");
      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);

      //Log the following only once (when transitioning from ON -> OFF)
      if (reached >= 0) {
        NameNode.stateChangeLog.info("STATE* Safe mode is OFF");
      }
      // Mark this instance as off and clear the safeMode reference so that
      // callers reading it observe "not in safe mode".
      reached = -1;
      reachedTimestamp = -1;
      safeMode = null;
      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
      NameNode.stateChangeLog.info("STATE* Network topology has "
          + nt.getNumOfRacks() + " racks and "
          + nt.getNumOfLeaves() + " datanodes");
      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
          + blockManager.numOfUnderReplicatedBlocks() + " blocks");

      startSecretManagerIfNecessary();

      // If startup has not yet completed, end safemode phase.
      StartupProgress prog = NameNode.getStartupProgress();
      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
        prog.endPhase(Phase.SAFEMODE);
      }
    }

    /**
     * Check whether we have reached the threshold for
     * initializing replication queues.
     *
     * @return true if replication queues should be populated in the current
     *         HA state and the number of safe blocks has reached
     *         {@code blockReplQueueThreshold}
     */
    private synchronized boolean canInitializeReplQueues() {
      return shouldPopulateReplQueues()
          && blockSafe >= blockReplQueueThreshold;
    }

    /**
     * Safe mode can be turned off iff
     * the threshold is reached and
     * the extension time has passed.
     * @return true if can leave or false otherwise.
*/
    private synchronized boolean canLeave() {
      // reached == 0: safe mode is on and the thresholds were never reached.
      if (reached == 0) {
        return false;
      }

      // Still inside the extension period that follows reaching the threshold.
      if (monotonicNow() - reached < extension) {
        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
        return false;
      }

      // The thresholds may have been lost again since they were reached.
      if (needEnter()) {
        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
        return false;
      }

      return true;
    }

    /**
     * There is no need to enter safe mode
     * if DFS is empty or {@link #threshold} == 0
     *
     * @return true if the safe-block threshold is not met, or the live
     *         datanode threshold is not met, or the NameNode does not have
     *         resources available
     */
    private boolean needEnter() {
      return (threshold != 0 && blockSafe < blockThreshold) ||
        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
        (!nameNodeHasResourcesAvailable());
    }

    /**
     * Check and trigger safe mode if needed.
     * <p>
     * Depending on the current state this either (re-)enters safe mode,
     * leaves it immediately, or records the time the threshold was reached
     * and starts the monitor thread that leaves safe mode later.
     * Does nothing while a transition to active is in progress.
     */
    private void checkMode() {
      // Have to have write-lock since leaving safemode initializes
      // repl queues, which requires write lock
      assert hasWriteLock();
      if (inTransitionToActive()) {
        return;
      }
      // if smmthread is already running, the block threshold must have been
      // reached before, there is no need to enter the safe mode again
      if (smmthread == null && needEnter()) {
        enter();
        // check if we are ready to initialize replication queues
        if (canInitializeReplQueues() && !isPopulatingReplQueues()
            && !haEnabled) {
          initializeReplQueues();
        }
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // the threshold is reached or was reached before
      if (!isOn() ||                           // safe mode is off
          extension <= 0 || threshold <= 0) {  // don't need to wait
        this.leave(); // leave safe mode
        return;
      }
      if (reached > 0) {  // threshold has already been reached before
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // start monitor
      // reached is a monotonic timestamp used for the extension arithmetic;
      // reachedTimestamp is the wall-clock time of the same moment.
      reached = monotonicNow();
      reachedTimestamp = now();
      if (smmthread == null) {
        smmthread = new Daemon(new SafeModeMonitor());
        smmthread.start();
        reportStatus("STATE* Safe mode extension entered.", true);
      }

      // check if we are ready to initialize replication queues
      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
        initializeReplQueues();
      }
    }

    /**
     * Set total number of blocks.
     * Recomputes the block and replication-queue thresholds from the new
     * total, resets a negative safe-block count to 0 and re-evaluates the
     * safe mode state.
     *
     * @param total the new total number of blocks
     */
    private synchronized void setBlockTotal(int total) {
      this.blockTotal = total;
      // Both thresholds are truncated towards zero by the int cast.
      this.blockThreshold = (int) (blockTotal * threshold);
      this.blockReplQueueThreshold =
        (int) (blockTotal * replQueueThreshold);
      if (haEnabled) {
        // After we initialize the block count, any further namespace
        // modifications done while in safe mode need to keep track
        // of the number of total blocks in the system.
        this.shouldIncrementallyTrackBlocks = true;
      }
      if(blockSafe < 0)
        this.blockSafe = 0;
      checkMode();
    }

    /**
     * Increment number of safe blocks if current block has
     * reached minimal replication.
     * Only a replication exactly equal to {@code safeReplication} counts, so
     * each block is counted once, at the moment it becomes safe.
     * @param replication current replication
     */
    private synchronized void incrementSafeBlockCount(short replication) {
      if (replication == safeReplication) {
        this.blockSafe++;

        // Report startup progress only if we haven't completed startup yet.
        StartupProgress prog = NameNode.getStartupProgress();
        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
          // The counter is looked up lazily on first use and then cached.
          if (this.awaitingReportedBlocksCounter == null) {
            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
              STEP_AWAITING_REPORTED_BLOCKS);
          }
          this.awaitingReportedBlocksCounter.increment();
        }

        checkMode();
      }
    }

    /**
     * Decrement number of safe blocks if current block has
     * fallen below minimal replication.
5379 * @param replication current replication 5380 */ 5381 private synchronized void decrementSafeBlockCount(short replication) { 5382 if (replication == safeReplication-1) { 5383 this.blockSafe--; 5384 //blockSafe is set to -1 in manual / low resources safemode 5385 assert blockSafe >= 0 || isManual() || areResourcesLow(); 5386 checkMode(); 5387 } 5388 } 5389 5390 /** 5391 * Check if safe mode was entered manually 5392 */ 5393 private boolean isManual() { 5394 return extension == Integer.MAX_VALUE; 5395 } 5396 5397 /** 5398 * Set manual safe mode. 5399 */ 5400 private synchronized void setManual() { 5401 extension = Integer.MAX_VALUE; 5402 } 5403 5404 /** 5405 * Check if safe mode was entered due to resources being low. 5406 */ 5407 private boolean areResourcesLow() { 5408 return resourcesLow; 5409 } 5410 5411 /** 5412 * Set that resources are low for this instance of safe mode. 5413 */ 5414 private void setResourcesLow() { 5415 resourcesLow = true; 5416 } 5417 5418 /** 5419 * A tip on how safe mode is to be turned off: manually or automatically. 5420 */ 5421 String getTurnOffTip() { 5422 if(!isOn()) { 5423 return "Safe mode is OFF."; 5424 } 5425 5426 //Manual OR low-resource safemode. (Admin intervention required) 5427 String adminMsg = "It was turned on manually. "; 5428 if (areResourcesLow()) { 5429 adminMsg = "Resources are low on NN. Please add or free up more " 5430 + "resources then turn off safe mode manually. NOTE: If you turn off" 5431 + " safe mode before adding resources, " 5432 + "the NN will immediately return to safe mode. 
"; 5433 } 5434 if (isManual() || areResourcesLow()) { 5435 return adminMsg 5436 + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off."; 5437 } 5438 5439 boolean thresholdsMet = true; 5440 int numLive = getNumLiveDataNodes(); 5441 String msg = ""; 5442 if (blockSafe < blockThreshold) { 5443 msg += String.format( 5444 "The reported blocks %d needs additional %d" 5445 + " blocks to reach the threshold %.4f of total blocks %d.%n", 5446 blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal); 5447 thresholdsMet = false; 5448 } else { 5449 msg += String.format("The reported blocks %d has reached the threshold" 5450 + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal); 5451 } 5452 if (numLive < datanodeThreshold) { 5453 msg += String.format( 5454 "The number of live datanodes %d needs an additional %d live " 5455 + "datanodes to reach the minimum number %d.%n", 5456 numLive, (datanodeThreshold - numLive), datanodeThreshold); 5457 thresholdsMet = false; 5458 } else { 5459 msg += String.format("The number of live datanodes %d has reached " 5460 + "the minimum number %d. ", 5461 numLive, datanodeThreshold); 5462 } 5463 msg += (reached > 0) ? "In safe mode extension. " : ""; 5464 msg += "Safe mode will be turned off automatically "; 5465 5466 if (!thresholdsMet) { 5467 msg += "once the thresholds have been reached."; 5468 } else if (reached + extension - monotonicNow() > 0) { 5469 msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds."); 5470 } else { 5471 msg += "soon."; 5472 } 5473 5474 return msg; 5475 } 5476 5477 /** 5478 * Print status every 20 seconds. 
*/
    private void reportStatus(String msg, boolean rightNow) {
      long curTime = now();
      // Rate-limit: unless forced, skip if the last report was < 20s ago.
      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
        return;
      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
      lastStatusReport = curTime;
    }

    /**
     * Describes the current safe-block count, target, threshold and minimal
     * replication, plus the time the threshold was reached if it was.
     */
    @Override
    public String toString() {
      String resText = "Current safe blocks = "
        + blockSafe
        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
        + ". Minimal replication = " + safeReplication + ".";
      if (reached > 0)
        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
      return resText;
    }

    /**
     * Checks consistency of the class state.
     * This is costly so only runs if asserts are enabled.
     *
     * @throws AssertionError if the block total differs from the block
     *         manager's active block count and the safe-block count is
     *         outside the range [0, blockTotal]
     */
    private void doConsistencyCheck() {
      boolean assertsOn = false;
      assert assertsOn = true; // set to true if asserts are on
      if (!assertsOn) return;

      if (blockTotal == -1 && blockSafe == -1) {
        return; // manual safe mode
      }
      int activeBlocks = blockManager.getActiveBlockCount();
      if ((blockTotal != activeBlocks) &&
          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
        throw new AssertionError(
            " SafeMode: Inconsistent filesystem state: "
        + "SafeMode data: blockTotal=" + blockTotal
        + " blockSafe=" + blockSafe + "; "
        + "BlockManager data: active="  + activeBlocks);
      }
    }

    /**
     * Applies deltas to the safe and total block counts.
     * This is a no-op unless incremental block tracking has been switched on.
     *
     * @param deltaSafe the change in the number of safe blocks
     * @param deltaTotal the change in the number of total blocks
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      assert haEnabled;

      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";

      // setBlockTotal recomputes the thresholds and re-checks the mode.
      blockSafe += deltaSafe;
      setBlockTotal(blockTotal + deltaTotal);
    }
  }

  /**
   * Periodically check whether it is time to leave safe mode.
   * This thread starts when the threshold level is reached.
   *
   */
  class SafeModeMonitor implements Runnable {
    /** interval in msec for checking safe mode: {@value} */
    private static final long recheckInterval = 1000;

    /**
     * Loops while the namesystem is running. On each pass, under the write
     * lock, it stops if safe mode is no longer set, or leaves safe mode and
     * stops if {@code canLeave()} is true; otherwise it sleeps for
     * {@link #recheckInterval} milliseconds and checks again.
     */
    @Override
    public void run() {
      while (fsRunning) {
        writeLock();
        try {
          if (safeMode == null) { // Not in safe mode.
            break;
          }
          if (safeMode.canLeave()) {
            // Leave safe mode.
            safeMode.leave();
            smmthread = null;
            break;
          }
        } finally {
          writeUnlock();
        }

        try {
          Thread.sleep(recheckInterval);
        } catch (InterruptedException ie) {
          // Ignored
          // NOTE(review): the interrupt status is not restored; the loop only
          // terminates through fsRunning or the breaks above.
        }
      }
      if (!fsRunning) {
        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
      }
    }
  }

  /**
   * Applies a safe mode action and reports the resulting state.
   * For any action other than {@code SAFEMODE_GET} the caller must pass the
   * superuser check; an unrecognized action is only logged.
   *
   * @param action the action to perform
   * @return true if the namesystem is in safe mode after the action
   * @throws IOException if the privilege check or entering safe mode fails
   */
  boolean setSafeMode(SafeModeAction action) throws IOException {
    if (action != SafeModeAction.SAFEMODE_GET) {
      checkSuperuserPrivilege();
      switch(action) {
      case SAFEMODE_LEAVE: // leave safe mode
        leaveSafeMode();
        break;
      case SAFEMODE_ENTER: // enter safe mode
        enterSafeMode(false);
        break;
      default:
        LOG.error("Unexpected safe mode action");
      }
    }
    return isInSafeMode();
  }

  /** Re-evaluates the safe mode state if safe mode is currently set. */
  @Override
  public void checkSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode != null) {
      safeMode.checkMode();
    }
  }

  /** @return true if safe mode is set and reports itself as on */
  @Override
  public boolean isInSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return false;
    return safeMode.isOn();
  }

  /**
   * @return true if safe mode is on and was entered neither manually nor
   *         because of low resources
   */
  @Override
  public boolean isInStartupSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return false;
    // If the NN is in safemode, and not due to manual / low resources, we
    // assume it must be because of startup. If the NN had low resources during
    // startup, we assume it came out of startup safemode and it is now in low
    // resources safemode
    return !safeMode.isManual() && !safeMode.areResourcesLow()
      && safeMode.isOn();
  }

  /**
   * Check if replication queues are to be populated
   * @return true when node is HAState.Active and not in the very first safemode
   */
  @Override
  public boolean isPopulatingReplQueues() {
    if (!shouldPopulateReplQueues()) {
      return false;
    }
    return initializedReplQueues;
  }

  /**
   * @return false if there is no HA context or HA state yet; otherwise
   *         whatever the current HA state answers
   */
  private boolean shouldPopulateReplQueues() {
    if(haContext == null || haContext.getState() == null)
      return false;
    return haContext.getState().shouldPopulateReplQueues();
  }

  /**
   * Forwards a replication count to the safe mode bookkeeping.
   * This is a no-op when safe mode is not set.
   *
   * @param replication current replication of the block (narrowed to short)
   */
  @Override
  public void incrementSafeBlockCount(int replication) {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return;
    safeMode.incrementSafeBlockCount((short)replication);
  }

  /**
   * Forwards the live replica count of a complete block to the safe mode
   * bookkeeping. This is a no-op when safe mode is not set or the stored
   * block is not complete.
   *
   * @param b the block whose replica count changed
   */
  @Override
  public void decrementSafeBlockCount(Block b) {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null) // mostly true
      return;
    // NOTE(review): storedBlock is dereferenced without a null check; this
    // assumes callers only pass blocks that are still stored -- confirm.
    BlockInfoContiguous storedBlock = getStoredBlock(b);
    if (storedBlock.isComplete()) {
      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
    }
  }

  /**
   * Adjust the total number of blocks safe and expected during safe mode.
5674 * If safe mode is not currently on, this is a no-op. 5675 * @param deltaSafe the change in number of safe blocks 5676 * @param deltaTotal the change i nnumber of total blocks expected 5677 */ 5678 @Override 5679 public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) { 5680 // safeMode is volatile, and may be set to null at any time 5681 SafeModeInfo safeMode = this.safeMode; 5682 if (safeMode == null) 5683 return; 5684 safeMode.adjustBlockTotals(deltaSafe, deltaTotal); 5685 } 5686 5687 /** 5688 * Set the total number of blocks in the system. 5689 */ 5690 public void setBlockTotal() { 5691 // safeMode is volatile, and may be set to null at any time 5692 SafeModeInfo safeMode = this.safeMode; 5693 if (safeMode == null) 5694 return; 5695 safeMode.setBlockTotal((int)getCompleteBlocksTotal()); 5696 } 5697 5698 /** 5699 * Get the total number of blocks in the system. 5700 */ 5701 @Override // FSNamesystemMBean 5702 @Metric 5703 public long getBlocksTotal() { 5704 return blockManager.getTotalBlocks(); 5705 } 5706 5707 /** 5708 * Get the total number of COMPLETE blocks in the system. 5709 * For safe mode only complete blocks are counted. 5710 */ 5711 private long getCompleteBlocksTotal() { 5712 // Calculate number of blocks under construction 5713 long numUCBlocks = 0; 5714 readLock(); 5715 numUCBlocks = leaseManager.getNumUnderConstructionBlocks(); 5716 try { 5717 return getBlocksTotal() - numUCBlocks; 5718 } finally { 5719 readUnlock("getCompleteBlocksTotal"); 5720 } 5721 } 5722 5723 /** 5724 * Enter safe mode. If resourcesLow is false, then we assume it is manual 5725 * @throws IOException 5726 */ 5727 void enterSafeMode(boolean resourcesLow) throws IOException { 5728 writeLock(); 5729 try { 5730 // Stop the secret manager, since rolling the master key would 5731 // try to write to the edit log 5732 stopSecretManager(); 5733 5734 // Ensure that any concurrent operations have been fully synced 5735 // before entering safe mode. 
This ensures that the FSImage 5736 // is entirely stable on disk as soon as we're in safe mode. 5737 boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite(); 5738 // Before Editlog is in OpenForWrite mode, editLogStream will be null. So, 5739 // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode 5740 if (isEditlogOpenForWrite) { 5741 getEditLog().logSyncAll(); 5742 } 5743 if (!isInSafeMode()) { 5744 safeMode = new SafeModeInfo(resourcesLow); 5745 return; 5746 } 5747 if (resourcesLow) { 5748 safeMode.setResourcesLow(); 5749 } else { 5750 safeMode.setManual(); 5751 } 5752 if (isEditlogOpenForWrite) { 5753 getEditLog().logSyncAll(); 5754 } 5755 NameNode.stateChangeLog.info("STATE* Safe mode is ON" 5756 + safeMode.getTurnOffTip()); 5757 } finally { 5758 writeUnlock("enterSafeMode"); 5759 } 5760 } 5761 5762 /** 5763 * Leave safe mode. 5764 */ 5765 void leaveSafeMode() { 5766 writeLock(); 5767 try { 5768 if (!isInSafeMode()) { 5769 NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 5770 return; 5771 } 5772 safeMode.leave(); 5773 } finally { 5774 writeUnlock("leaveSafeMode"); 5775 } 5776 } 5777 5778 String getSafeModeTip() { 5779 // There is no need to take readLock. 5780 // Don't use isInSafeMode as this.safeMode might be set to null. 5781 // after isInSafeMode returns. 
5782 boolean inSafeMode; 5783 SafeModeInfo safeMode = this.safeMode; 5784 if (safeMode == null) { 5785 inSafeMode = false; 5786 } else { 5787 inSafeMode = safeMode.isOn(); 5788 } 5789 5790 if (!inSafeMode) { 5791 return ""; 5792 } else { 5793 return safeMode.getTurnOffTip(); 5794 } 5795 } 5796 5797 CheckpointSignature rollEditLog() throws IOException { 5798 checkSuperuserPrivilege(); 5799 checkOperation(OperationCategory.JOURNAL); 5800 writeLock(); 5801 try { 5802 checkOperation(OperationCategory.JOURNAL); 5803 checkNameNodeSafeMode("Log not rolled"); 5804 if (Server.isRpcInvocation()) { 5805 LOG.info("Roll Edit Log from " + Server.getRemoteAddress()); 5806 } 5807 return getFSImage().rollEditLog(); 5808 } finally { 5809 writeUnlock("rollEditLog"); 5810 } 5811 } 5812 5813 NamenodeCommand startCheckpoint(NamenodeRegistration backupNode, 5814 NamenodeRegistration activeNamenode) throws IOException { 5815 checkOperation(OperationCategory.CHECKPOINT); 5816 writeLock(); 5817 try { 5818 checkOperation(OperationCategory.CHECKPOINT); 5819 checkNameNodeSafeMode("Checkpoint not started"); 5820 5821 LOG.info("Start checkpoint for " + backupNode.getAddress()); 5822 NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode, 5823 activeNamenode); 5824 getEditLog().logSync(); 5825 return cmd; 5826 } finally { 5827 writeUnlock("startCheckpoint"); 5828 } 5829 } 5830 5831 public void processIncrementalBlockReport(final DatanodeID nodeID, 5832 final StorageReceivedDeletedBlocks srdb) 5833 throws IOException { 5834 writeLock(); 5835 try { 5836 blockManager.processIncrementalBlockReport(nodeID, srdb); 5837 } finally { 5838 writeUnlock("processIncrementalBlockReport"); 5839 } 5840 } 5841 5842 void endCheckpoint(NamenodeRegistration registration, 5843 CheckpointSignature sig) throws IOException { 5844 checkOperation(OperationCategory.CHECKPOINT); 5845 readLock(); 5846 try { 5847 checkOperation(OperationCategory.CHECKPOINT); 5848 checkNameNodeSafeMode("Checkpoint not ended"); 5849 
LOG.info("End checkpoint for " + registration.getAddress()); 5850 getFSImage().endCheckpoint(sig); 5851 } finally { 5852 readUnlock("endCheckpoint"); 5853 } 5854 } 5855 5856 PermissionStatus createFsOwnerPermissions(FsPermission permission) { 5857 return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission); 5858 } 5859 5860 private void checkUnreadableBySuperuser(FSPermissionChecker pc, 5861 INode inode, int snapshotId) 5862 throws IOException { 5863 if (pc.isSuperUser()) { 5864 for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) { 5865 if (XAttrHelper.getPrefixName(xattr). 5866 equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) { 5867 throw new AccessControlException("Access is denied for " + 5868 pc.getUser() + " since the superuser is not allowed to " + 5869 "perform this operation."); 5870 } 5871 } 5872 } 5873 } 5874 5875 @Override 5876 public void checkSuperuserPrivilege() 5877 throws AccessControlException { 5878 if (isPermissionEnabled) { 5879 FSPermissionChecker pc = getPermissionChecker(); 5880 pc.checkSuperuserPrivilege(); 5881 } 5882 } 5883 5884 /** 5885 * Check to see if we have exceeded the limit on the number 5886 * of inodes. 5887 */ 5888 void checkFsObjectLimit() throws IOException { 5889 if (maxFsObjects != 0 && 5890 maxFsObjects <= dir.totalInodes() + getBlocksTotal()) { 5891 throw new IOException("Exceeded the configured number of objects " + 5892 maxFsObjects + " in the filesystem."); 5893 } 5894 } 5895 5896 /** 5897 * Get the total number of objects in the system. 5898 */ 5899 @Override // FSNamesystemMBean 5900 public long getMaxObjects() { 5901 return maxFsObjects; 5902 } 5903 5904 @Override // FSNamesystemMBean 5905 @Metric 5906 public long getFilesTotal() { 5907 // There is no need to take fSNamesystem's lock as 5908 // FSDirectory has its own lock. 
5909 return this.dir.totalInodes(); 5910 } 5911 5912 @Override // FSNamesystemMBean 5913 @Metric 5914 public long getPendingReplicationBlocks() { 5915 return blockManager.getPendingReplicationBlocksCount(); 5916 } 5917 5918 @Override // FSNamesystemMBean 5919 @Metric 5920 public long getUnderReplicatedBlocks() { 5921 return blockManager.getUnderReplicatedBlocksCount(); 5922 } 5923 5924 /** Returns number of blocks with corrupt replicas */ 5925 @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"}) 5926 public long getCorruptReplicaBlocks() { 5927 return blockManager.getCorruptReplicaBlocksCount(); 5928 } 5929 5930 @Override // FSNamesystemMBean 5931 @Metric 5932 public long getScheduledReplicationBlocks() { 5933 return blockManager.getScheduledReplicationBlocksCount(); 5934 } 5935 5936 @Override 5937 @Metric 5938 public long getPendingDeletionBlocks() { 5939 return blockManager.getPendingDeletionBlocksCount(); 5940 } 5941 5942 @Override 5943 public long getBlockDeletionStartTime() { 5944 return startTime + blockManager.getStartupDelayBlockDeletionInMs(); 5945 } 5946 5947 @Metric 5948 public long getExcessBlocks() { 5949 return blockManager.getExcessBlocksCount(); 5950 } 5951 5952 // HA-only metric 5953 @Metric 5954 public long getPostponedMisreplicatedBlocks() { 5955 return blockManager.getPostponedMisreplicatedBlocksCount(); 5956 } 5957 5958 // HA-only metric 5959 @Metric 5960 public int getPendingDataNodeMessageCount() { 5961 return blockManager.getPendingDataNodeMessageCount(); 5962 } 5963 5964 // HA-only metric 5965 @Metric 5966 public String getHAState() { 5967 return haContext.getState().toString(); 5968 } 5969 5970 // HA-only metric 5971 @Metric 5972 public long getMillisSinceLastLoadedEdits() { 5973 if (isInStandbyState() && editLogTailer != null) { 5974 return monotonicNow() - editLogTailer.getLastLoadTimeMs(); 5975 } else { 5976 return 0; 5977 } 5978 } 5979 5980 @Metric 5981 public int getBlockCapacity() { 5982 return 
blockManager.getCapacity(); 5983 } 5984 5985 @Override // FSNamesystemMBean 5986 public String getFSState() { 5987 return isInSafeMode() ? "safeMode" : "Operational"; 5988 } 5989 5990 private ObjectName mbeanName; 5991 private ObjectName mxbeanName; 5992 5993 /** 5994 * Register the FSNamesystem MBean using the name 5995 * "hadoop:service=NameNode,name=FSNamesystemState" 5996 */ 5997 private void registerMBean() { 5998 // We can only implement one MXBean interface, so we keep the old one. 5999 try { 6000 StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class); 6001 mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean); 6002 } catch (NotCompliantMBeanException e) { 6003 throw new RuntimeException("Bad MBean setup", e); 6004 } 6005 6006 LOG.info("Registered FSNamesystemState MBean"); 6007 } 6008 6009 /** 6010 * shutdown FSNamesystem 6011 */ 6012 void shutdown() { 6013 if (snapshotManager != null) { 6014 snapshotManager.shutdown(); 6015 } 6016 if (mbeanName != null) { 6017 MBeans.unregister(mbeanName); 6018 mbeanName = null; 6019 } 6020 if (mxbeanName != null) { 6021 MBeans.unregister(mxbeanName); 6022 mxbeanName = null; 6023 } 6024 if (dir != null) { 6025 dir.shutdown(); 6026 } 6027 if (blockManager != null) { 6028 blockManager.shutdown(); 6029 } 6030 } 6031 6032 @Override // FSNamesystemMBean 6033 public int getNumLiveDataNodes() { 6034 return getBlockManager().getDatanodeManager().getNumLiveDataNodes(); 6035 } 6036 6037 @Override // FSNamesystemMBean 6038 public int getNumDeadDataNodes() { 6039 return getBlockManager().getDatanodeManager().getNumDeadDataNodes(); 6040 } 6041 6042 @Override // FSNamesystemMBean 6043 public int getNumDecomLiveDataNodes() { 6044 final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); 6045 getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false); 6046 int liveDecommissioned = 0; 6047 for (DatanodeDescriptor node : live) { 6048 liveDecommissioned += node.isDecommissioned() ? 
1 : 0; 6049 } 6050 return liveDecommissioned; 6051 } 6052 6053 @Override // FSNamesystemMBean 6054 public int getNumDecomDeadDataNodes() { 6055 final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>(); 6056 getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false); 6057 int deadDecommissioned = 0; 6058 for (DatanodeDescriptor node : dead) { 6059 deadDecommissioned += node.isDecommissioned() ? 1 : 0; 6060 } 6061 return deadDecommissioned; 6062 } 6063 6064 @Override // FSNamesystemMBean 6065 public int getVolumeFailuresTotal() { 6066 List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); 6067 getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false); 6068 int volumeFailuresTotal = 0; 6069 for (DatanodeDescriptor node: live) { 6070 volumeFailuresTotal += node.getVolumeFailures(); 6071 } 6072 return volumeFailuresTotal; 6073 } 6074 6075 @Override // FSNamesystemMBean 6076 public long getEstimatedCapacityLostTotal() { 6077 List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); 6078 getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false); 6079 long estimatedCapacityLostTotal = 0; 6080 for (DatanodeDescriptor node: live) { 6081 VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary(); 6082 if (volumeFailureSummary != null) { 6083 estimatedCapacityLostTotal += 6084 volumeFailureSummary.getEstimatedCapacityLostTotal(); 6085 } 6086 } 6087 return estimatedCapacityLostTotal; 6088 } 6089 6090 @Override // FSNamesystemMBean 6091 public int getNumDecommissioningDataNodes() { 6092 return getBlockManager().getDatanodeManager().getDecommissioningNodes() 6093 .size(); 6094 } 6095 6096 @Override // FSNamesystemMBean 6097 @Metric({"StaleDataNodes", 6098 "Number of datanodes marked stale due to delayed heartbeat"}) 6099 public int getNumStaleDataNodes() { 6100 return getBlockManager().getDatanodeManager().getNumStaleNodes(); 6101 } 6102 6103 /** 6104 * Storages are marked as 
"content stale" after NN restart or fails over and 6105 * before NN receives the first Heartbeat followed by the first Blockreport. 6106 */ 6107 @Override // FSNamesystemMBean 6108 public int getNumStaleStorages() { 6109 return getBlockManager().getDatanodeManager().getNumStaleStorages(); 6110 } 6111 6112 @Override // FSNamesystemMBean 6113 public String getTopUserOpCounts() { 6114 if (!topConf.isEnabled) { 6115 return null; 6116 } 6117 6118 Date now = new Date(); 6119 final List<RollingWindowManager.TopWindow> topWindows = 6120 topMetrics.getTopWindows(); 6121 Map<String, Object> topMap = new TreeMap<String, Object>(); 6122 topMap.put("windows", topWindows); 6123 topMap.put("timestamp", DFSUtil.dateToIso8601String(now)); 6124 ObjectMapper mapper = new ObjectMapper(); 6125 try { 6126 return mapper.writeValueAsString(topMap); 6127 } catch (IOException e) { 6128 LOG.warn("Failed to fetch TopUser metrics", e); 6129 } 6130 return null; 6131 } 6132 6133 /** 6134 * Increments, logs and then returns the stamp 6135 */ 6136 long nextGenerationStamp(boolean legacyBlock) 6137 throws IOException, SafeModeException { 6138 assert hasWriteLock(); 6139 checkNameNodeSafeMode("Cannot get next generation stamp"); 6140 6141 long gs = blockIdManager.nextGenerationStamp(legacyBlock); 6142 if (legacyBlock) { 6143 getEditLog().logGenerationStampV1(gs); 6144 } else { 6145 getEditLog().logGenerationStampV2(gs); 6146 } 6147 6148 // NB: callers sync the log 6149 return gs; 6150 } 6151 6152 /** 6153 * Increments, logs and then returns the block ID 6154 */ 6155 private long nextBlockId() throws IOException { 6156 assert hasWriteLock(); 6157 checkNameNodeSafeMode("Cannot get next block ID"); 6158 final long blockId = blockIdManager.nextBlockId(); 6159 getEditLog().logAllocateBlockId(blockId); 6160 // NB: callers sync the log 6161 return blockId; 6162 } 6163 6164 private boolean isFileDeleted(INodeFile file) { 6165 // Not in the inodeMap or in the snapshot but marked deleted. 
6166 if (dir.getInode(file.getId()) == null) { 6167 return true; 6168 } 6169 6170 // look at the path hierarchy to see if one parent is deleted by recursive 6171 // deletion 6172 INode tmpChild = file; 6173 INodeDirectory tmpParent = file.getParent(); 6174 while (true) { 6175 if (tmpParent == null) { 6176 return true; 6177 } 6178 6179 INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(), 6180 Snapshot.CURRENT_STATE_ID); 6181 if (childINode == null || !childINode.equals(tmpChild)) { 6182 // a newly created INode with the same name as an already deleted one 6183 // would be a different INode than the deleted one 6184 return true; 6185 } 6186 6187 if (tmpParent.isRoot()) { 6188 break; 6189 } 6190 6191 tmpChild = tmpParent; 6192 tmpParent = tmpParent.getParent(); 6193 } 6194 6195 if (file.isWithSnapshot() && 6196 file.getFileWithSnapshotFeature().isCurrentFileDeleted()) { 6197 return true; 6198 } 6199 return false; 6200 } 6201 6202 private INodeFile checkUCBlock(ExtendedBlock block, 6203 String clientName) throws IOException { 6204 assert hasWriteLock(); 6205 checkNameNodeSafeMode("Cannot get a new generation stamp and an " 6206 + "access token for block " + block); 6207 6208 // check stored block state 6209 BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block)); 6210 if (storedBlock == null || 6211 storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) { 6212 throw new IOException(block + 6213 " does not exist or is not under Construction" + storedBlock); 6214 } 6215 6216 // check file inode 6217 final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile(); 6218 if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) { 6219 throw new IOException("The file " + storedBlock + 6220 " belonged to does not exist or it is not under construction."); 6221 } 6222 6223 // check lease 6224 if (clientName == null 6225 || !clientName.equals(file.getFileUnderConstructionFeature() 6226 
.getClientName())) { 6227 throw new LeaseExpiredException("Lease mismatch: " + block + 6228 " is accessed by a non lease holder " + clientName); 6229 } 6230 6231 return file; 6232 } 6233 6234 /** 6235 * Client is reporting some bad block locations. 6236 */ 6237 void reportBadBlocks(LocatedBlock[] blocks) throws IOException { 6238 checkOperation(OperationCategory.WRITE); 6239 writeLock(); 6240 try { 6241 checkOperation(OperationCategory.WRITE); 6242 for (int i = 0; i < blocks.length; i++) { 6243 ExtendedBlock blk = blocks[i].getBlock(); 6244 DatanodeInfo[] nodes = blocks[i].getLocations(); 6245 String[] storageIDs = blocks[i].getStorageIDs(); 6246 for (int j = 0; j < nodes.length; j++) { 6247 NameNode.stateChangeLog.info("*DIR* reportBadBlocks for block: {} on" 6248 + " datanode: {}", blk, nodes[j].getXferAddr()); 6249 blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j], 6250 storageIDs == null ? null: storageIDs[j], 6251 "client machine reported it"); 6252 } 6253 } 6254 } finally { 6255 writeUnlock("reportBadBlocks"); 6256 } 6257 } 6258 6259 /** 6260 * Get a new generation stamp together with an access token for 6261 * a block under construction 6262 * 6263 * This method is called for recovering a failed pipeline or setting up 6264 * a pipeline to append to a block. 
6265 * 6266 * @param block a block 6267 * @param clientName the name of a client 6268 * @return a located block with a new generation stamp and an access token 6269 * @throws IOException if any error occurs 6270 */ 6271 LocatedBlock updateBlockForPipeline(ExtendedBlock block, 6272 String clientName) throws IOException { 6273 LocatedBlock locatedBlock; 6274 checkOperation(OperationCategory.WRITE); 6275 writeLock(); 6276 try { 6277 checkOperation(OperationCategory.WRITE); 6278 6279 // check vadility of parameters 6280 checkUCBlock(block, clientName); 6281 6282 // get a new generation stamp and an access token 6283 block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock()))); 6284 locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]); 6285 blockManager.setBlockToken(locatedBlock, AccessMode.WRITE); 6286 } finally { 6287 writeUnlock("bumpBlockGenerationStamp"); 6288 } 6289 // Ensure we record the new generation stamp 6290 getEditLog().logSync(); 6291 return locatedBlock; 6292 } 6293 6294 /** 6295 * Update a pipeline for a block under construction 6296 * 6297 * @param clientName the name of the client 6298 * @param oldBlock and old block 6299 * @param newBlock a new block with a new generation stamp and length 6300 * @param newNodes datanodes in the pipeline 6301 * @throws IOException if any error occurs 6302 */ 6303 void updatePipeline( 6304 String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock, 6305 DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache) 6306 throws IOException { 6307 LOG.info("updatePipeline(" + oldBlock.getLocalBlock() 6308 + ", newGS=" + newBlock.getGenerationStamp() 6309 + ", newLength=" + newBlock.getNumBytes() 6310 + ", newNodes=" + Arrays.asList(newNodes) 6311 + ", client=" + clientName 6312 + ")"); 6313 waitForLoadingFSImage(); 6314 writeLock(); 6315 try { 6316 checkOperation(OperationCategory.WRITE); 6317 checkNameNodeSafeMode("Pipeline not updated"); 6318 assert 
newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and " 6319 + oldBlock + " has different block identifier"; 6320 updatePipelineInternal(clientName, oldBlock, newBlock, newNodes, 6321 newStorageIDs, logRetryCache); 6322 } finally { 6323 writeUnlock("updatePipeline"); 6324 } 6325 getEditLog().logSync(); 6326 LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => " 6327 + newBlock.getLocalBlock() + ") success"); 6328 } 6329 6330 private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, 6331 ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs, 6332 boolean logRetryCache) 6333 throws IOException { 6334 assert hasWriteLock(); 6335 // check the vadility of the block and lease holder name 6336 final INodeFile pendingFile = checkUCBlock(oldBlock, clientName); 6337 final String src = pendingFile.getFullPathName(); 6338 final BlockInfoContiguousUnderConstruction blockinfo 6339 = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock(); 6340 6341 // check new GS & length: this is not expected 6342 if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() || 6343 newBlock.getNumBytes() < blockinfo.getNumBytes()) { 6344 String msg = "Update " + oldBlock + " (len = " + 6345 blockinfo.getNumBytes() + ") to an older state: " + newBlock + 6346 " (len = " + newBlock.getNumBytes() +")"; 6347 LOG.warn(msg); 6348 throw new IOException(msg); 6349 } 6350 6351 // Update old block with the new generation stamp and new length 6352 blockManager.updateLastBlock(blockinfo, newBlock); 6353 6354 // find the DatanodeDescriptor objects 6355 final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager() 6356 .getDatanodeStorageInfos(newNodes, newStorageIDs, 6357 "src=%s, oldBlock=%s, newBlock=%s, clientName=%s", 6358 src, oldBlock, newBlock, clientName); 6359 blockinfo.setExpectedLocations(storages); 6360 6361 persistBlocks(src, pendingFile, logRetryCache); 6362 } 6363 6364 // rename was successful. 
If any part of the renamed subtree had 6365 // files that were being written to, update with new filename. 6366 void unprotectedChangeLease(String src, String dst) { 6367 assert hasWriteLock(); 6368 leaseManager.changeLease(src, dst); 6369 } 6370 6371 /** 6372 * Serializes leases. 6373 */ 6374 void saveFilesUnderConstruction(DataOutputStream out, 6375 Map<Long, INodeFile> snapshotUCMap) throws IOException { 6376 // This is run by an inferior thread of saveNamespace, which holds a read 6377 // lock on our behalf. If we took the read lock here, we could block 6378 // for fairness if a writer is waiting on the lock. 6379 synchronized (leaseManager) { 6380 Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction(); 6381 for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) { 6382 // TODO: for HDFS-5428, because of rename operations, some 6383 // under-construction files that are 6384 // in the current fs directory can also be captured in the 6385 // snapshotUCMap. We should remove them from the snapshotUCMap. 
6386 snapshotUCMap.remove(entry.getValue().getId()); 6387 } 6388 6389 out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size 6390 for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) { 6391 FSImageSerialization.writeINodeUnderConstruction( 6392 out, entry.getValue(), entry.getKey()); 6393 } 6394 for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) { 6395 // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>" 6396 // as their paths 6397 StringBuilder b = new StringBuilder(); 6398 b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX) 6399 .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING) 6400 .append(Path.SEPARATOR).append(entry.getValue().getId()); 6401 FSImageSerialization.writeINodeUnderConstruction( 6402 out, entry.getValue(), b.toString()); 6403 } 6404 } 6405 } 6406 6407 /** 6408 * @return all the under-construction files in the lease map 6409 */ 6410 Map<String, INodeFile> getFilesUnderConstruction() { 6411 synchronized (leaseManager) { 6412 return leaseManager.getINodesUnderConstruction(); 6413 } 6414 } 6415 6416 /** 6417 * Register a Backup name-node, verifying that it belongs 6418 * to the correct namespace, and adding it to the set of 6419 * active journals if necessary. 
6420 * 6421 * @param bnReg registration of the new BackupNode 6422 * @param nnReg registration of this NameNode 6423 * @throws IOException if the namespace IDs do not match 6424 */ 6425 void registerBackupNode(NamenodeRegistration bnReg, 6426 NamenodeRegistration nnReg) throws IOException { 6427 writeLock(); 6428 try { 6429 if(getFSImage().getStorage().getNamespaceID() 6430 != bnReg.getNamespaceID()) 6431 throw new IOException("Incompatible namespaceIDs: " 6432 + " Namenode namespaceID = " 6433 + getFSImage().getStorage().getNamespaceID() + "; " 6434 + bnReg.getRole() + 6435 " node namespaceID = " + bnReg.getNamespaceID()); 6436 if (bnReg.getRole() == NamenodeRole.BACKUP) { 6437 getFSImage().getEditLog().registerBackupNode( 6438 bnReg, nnReg); 6439 } 6440 } finally { 6441 writeUnlock("registerBackupNode"); 6442 } 6443 } 6444 6445 /** 6446 * Release (unregister) backup node. 6447 * <p> 6448 * Find and remove the backup stream corresponding to the node. 6449 * @throws IOException 6450 */ 6451 void releaseBackupNode(NamenodeRegistration registration) 6452 throws IOException { 6453 checkOperation(OperationCategory.WRITE); 6454 writeLock(); 6455 try { 6456 checkOperation(OperationCategory.WRITE); 6457 if(getFSImage().getStorage().getNamespaceID() 6458 != registration.getNamespaceID()) 6459 throw new IOException("Incompatible namespaceIDs: " 6460 + " Namenode namespaceID = " 6461 + getFSImage().getStorage().getNamespaceID() + "; " 6462 + registration.getRole() + 6463 " node namespaceID = " + registration.getNamespaceID()); 6464 getEditLog().releaseBackupStream(registration); 6465 } finally { 6466 writeUnlock("releaseBackupNode"); 6467 } 6468 } 6469 6470 static class CorruptFileBlockInfo { 6471 final String path; 6472 final Block block; 6473 6474 public CorruptFileBlockInfo(String p, Block b) { 6475 path = p; 6476 block = b; 6477 } 6478 6479 @Override 6480 public String toString() { 6481 return block.getBlockName() + "\t" + path; 6482 } 6483 } 6484 /** 6485 * @param path 
Restrict corrupt files to this portion of namespace. 6486 * @param cookieTab Support for continuation; cookieTab tells where 6487 * to start from 6488 * @return a list in which each entry describes a corrupt file/block 6489 * @throws IOException 6490 */ 6491 Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path, 6492 String[] cookieTab) throws IOException { 6493 checkSuperuserPrivilege(); 6494 checkOperation(OperationCategory.READ); 6495 6496 int count = 0; 6497 ArrayList<CorruptFileBlockInfo> corruptFiles = 6498 new ArrayList<CorruptFileBlockInfo>(); 6499 if (cookieTab == null) { 6500 cookieTab = new String[] { null }; 6501 } 6502 6503 // Do a quick check if there are any corrupt files without taking the lock 6504 if (blockManager.getMissingBlocksCount() == 0) { 6505 if (cookieTab[0] == null) { 6506 cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0])); 6507 } 6508 if (LOG.isDebugEnabled()) { 6509 LOG.debug("there are no corrupt file blocks."); 6510 } 6511 return corruptFiles; 6512 } 6513 6514 readLock(); 6515 try { 6516 checkOperation(OperationCategory.READ); 6517 if (!isPopulatingReplQueues()) { 6518 throw new IOException("Cannot run listCorruptFileBlocks because " + 6519 "replication queues have not been initialized."); 6520 } 6521 // print a limited # of corrupt files per call 6522 6523 final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator(); 6524 6525 int skip = getIntCookie(cookieTab[0]); 6526 for (int i = 0; i < skip && blkIterator.hasNext(); i++) { 6527 blkIterator.next(); 6528 } 6529 6530 while (blkIterator.hasNext()) { 6531 Block blk = blkIterator.next(); 6532 final INode inode = (INode)blockManager.getBlockCollection(blk); 6533 skip++; 6534 if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) { 6535 String src = inode.getFullPathName(); 6536 if (src.startsWith(path)){ 6537 corruptFiles.add(new CorruptFileBlockInfo(src, blk)); 6538 count++; 6539 if (count >= 
DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED) 6540 break; 6541 } 6542 } 6543 } 6544 cookieTab[0] = String.valueOf(skip); 6545 if (LOG.isDebugEnabled()) { 6546 LOG.debug("list corrupt file blocks returned: " + count); 6547 } 6548 return corruptFiles; 6549 } finally { 6550 readUnlock("listCorruptFileBlocks"); 6551 } 6552 } 6553 6554 /** 6555 * Convert string cookie to integer. 6556 */ 6557 private static int getIntCookie(String cookie){ 6558 int c; 6559 if(cookie == null){ 6560 c = 0; 6561 } else { 6562 try{ 6563 c = Integer.parseInt(cookie); 6564 }catch (NumberFormatException e) { 6565 c = 0; 6566 } 6567 } 6568 c = Math.max(0, c); 6569 return c; 6570 } 6571 6572 /** 6573 * Create delegation token secret manager 6574 */ 6575 private DelegationTokenSecretManager createDelegationTokenSecretManager( 6576 Configuration conf) { 6577 return new DelegationTokenSecretManager(conf.getLong( 6578 DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, 6579 DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT), 6580 conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY, 6581 DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT), 6582 conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, 6583 DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT), 6584 DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL, 6585 conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY, 6586 DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT), 6587 this); 6588 } 6589 6590 /** 6591 * Returns the DelegationTokenSecretManager instance in the namesystem. 
6592 * @return delegation token secret manager object 6593 */ 6594 DelegationTokenSecretManager getDelegationTokenSecretManager() { 6595 return dtSecretManager; 6596 } 6597 6598 /** 6599 * @param renewer Renewer information 6600 * @return delegation toek 6601 * @throws IOException on error 6602 */ 6603 Token<DelegationTokenIdentifier> getDelegationToken(Text renewer) 6604 throws IOException { 6605 Token<DelegationTokenIdentifier> token; 6606 checkOperation(OperationCategory.WRITE); 6607 writeLock(); 6608 try { 6609 checkOperation(OperationCategory.WRITE); 6610 checkNameNodeSafeMode("Cannot issue delegation token"); 6611 if (!isAllowedDelegationTokenOp()) { 6612 throw new IOException( 6613 "Delegation Token can be issued only with kerberos or web authentication"); 6614 } 6615 if (dtSecretManager == null || !dtSecretManager.isRunning()) { 6616 LOG.warn("trying to get DT with no secret manager running"); 6617 return null; 6618 } 6619 6620 UserGroupInformation ugi = getRemoteUser(); 6621 String user = ugi.getUserName(); 6622 Text owner = new Text(user); 6623 Text realUser = null; 6624 if (ugi.getRealUser() != null) { 6625 realUser = new Text(ugi.getRealUser().getUserName()); 6626 } 6627 DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner, 6628 renewer, realUser); 6629 token = new Token<DelegationTokenIdentifier>( 6630 dtId, dtSecretManager); 6631 long expiryTime = dtSecretManager.getTokenExpiryTime(dtId); 6632 getEditLog().logGetDelegationToken(dtId, expiryTime); 6633 } finally { 6634 writeUnlock("getDelegationToken"); 6635 } 6636 getEditLog().logSync(); 6637 return token; 6638 } 6639 6640 /** 6641 * 6642 * @param token token to renew 6643 * @return new expiryTime of the token 6644 * @throws InvalidToken if {@code token} is invalid 6645 * @throws IOException on other errors 6646 */ 6647 long renewDelegationToken(Token<DelegationTokenIdentifier> token) 6648 throws InvalidToken, IOException { 6649 long expiryTime; 6650 
checkOperation(OperationCategory.WRITE); 6651 writeLock(); 6652 try { 6653 checkOperation(OperationCategory.WRITE); 6654 6655 checkNameNodeSafeMode("Cannot renew delegation token"); 6656 if (!isAllowedDelegationTokenOp()) { 6657 throw new IOException( 6658 "Delegation Token can be renewed only with kerberos or web authentication"); 6659 } 6660 String renewer = getRemoteUser().getShortUserName(); 6661 expiryTime = dtSecretManager.renewToken(token, renewer); 6662 DelegationTokenIdentifier id = new DelegationTokenIdentifier(); 6663 ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); 6664 DataInputStream in = new DataInputStream(buf); 6665 id.readFields(in); 6666 getEditLog().logRenewDelegationToken(id, expiryTime); 6667 } finally { 6668 writeUnlock("renewDelegationToken"); 6669 } 6670 getEditLog().logSync(); 6671 return expiryTime; 6672 } 6673 6674 /** 6675 * 6676 * @param token token to cancel 6677 * @throws IOException on error 6678 */ 6679 void cancelDelegationToken(Token<DelegationTokenIdentifier> token) 6680 throws IOException { 6681 checkOperation(OperationCategory.WRITE); 6682 writeLock(); 6683 try { 6684 checkOperation(OperationCategory.WRITE); 6685 6686 checkNameNodeSafeMode("Cannot cancel delegation token"); 6687 String canceller = getRemoteUser().getUserName(); 6688 DelegationTokenIdentifier id = dtSecretManager 6689 .cancelToken(token, canceller); 6690 getEditLog().logCancelDelegationToken(id); 6691 } finally { 6692 writeUnlock("cancelDelegationToken"); 6693 } 6694 getEditLog().logSync(); 6695 } 6696 6697 /** 6698 * @param out save state of the secret manager 6699 * @param sdPath String storage directory path 6700 */ 6701 void saveSecretManagerStateCompat(DataOutputStream out, String sdPath) 6702 throws IOException { 6703 dtSecretManager.saveSecretManagerStateCompat(out, sdPath); 6704 } 6705 6706 SecretManagerState saveSecretManagerState() { 6707 return dtSecretManager.saveSecretManagerState(); 6708 } 6709 6710 /** 6711 * @param in 
load the state of secret manager from input stream 6712 */ 6713 void loadSecretManagerStateCompat(DataInput in) throws IOException { 6714 dtSecretManager.loadSecretManagerStateCompat(in); 6715 } 6716 6717 void loadSecretManagerState(SecretManagerSection s, 6718 List<SecretManagerSection.DelegationKey> keys, 6719 List<SecretManagerSection.PersistToken> tokens) throws IOException { 6720 dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens)); 6721 } 6722 6723 /** 6724 * Log the updateMasterKey operation to edit logs 6725 * 6726 * @param key new delegation key. 6727 */ 6728 public void logUpdateMasterKey(DelegationKey key) { 6729 6730 assert !isInSafeMode() : 6731 "this should never be called while in safemode, since we stop " + 6732 "the DT manager before entering safemode!"; 6733 // No need to hold FSN lock since we don't access any internal 6734 // structures, and this is stopped before the FSN shuts itself 6735 // down, etc. 6736 getEditLog().logUpdateMasterKey(key); 6737 getEditLog().logSync(); 6738 } 6739 6740 /** 6741 * Log the cancellation of expired tokens to edit logs 6742 * 6743 * @param id token identifier to cancel 6744 */ 6745 public void logExpireDelegationToken(DelegationTokenIdentifier id) { 6746 assert !isInSafeMode() : 6747 "this should never be called while in safemode, since we stop " + 6748 "the DT manager before entering safemode!"; 6749 // No need to hold FSN lock since we don't access any internal 6750 // structures, and this is stopped before the FSN shuts itself 6751 // down, etc. 
6752 getEditLog().logCancelDelegationToken(id); 6753 } 6754 6755 private void logReassignLease(String leaseHolder, String src, 6756 String newHolder) { 6757 assert hasWriteLock(); 6758 getEditLog().logReassignLease(leaseHolder, src, newHolder); 6759 } 6760 6761 /** 6762 * 6763 * @return true if delegation token operation is allowed 6764 */ 6765 private boolean isAllowedDelegationTokenOp() throws IOException { 6766 AuthenticationMethod authMethod = getConnectionAuthenticationMethod(); 6767 if (UserGroupInformation.isSecurityEnabled() 6768 && (authMethod != AuthenticationMethod.KERBEROS) 6769 && (authMethod != AuthenticationMethod.KERBEROS_SSL) 6770 && (authMethod != AuthenticationMethod.CERTIFICATE)) { 6771 return false; 6772 } 6773 return true; 6774 } 6775 6776 /** 6777 * Returns authentication method used to establish the connection 6778 * @return AuthenticationMethod used to establish connection 6779 * @throws IOException 6780 */ 6781 private AuthenticationMethod getConnectionAuthenticationMethod() 6782 throws IOException { 6783 UserGroupInformation ugi = getRemoteUser(); 6784 AuthenticationMethod authMethod = ugi.getAuthenticationMethod(); 6785 if (authMethod == AuthenticationMethod.PROXY) { 6786 authMethod = ugi.getRealUser().getAuthenticationMethod(); 6787 } 6788 return authMethod; 6789 } 6790 6791 /** 6792 * Client invoked methods are invoked over RPC and will be in 6793 * RPC call context even if the client exits. 
6794 */ 6795 boolean isExternalInvocation() { 6796 return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation(); 6797 } 6798 6799 private static InetAddress getRemoteIp() { 6800 InetAddress ip = Server.getRemoteIp(); 6801 if (ip != null) { 6802 return ip; 6803 } 6804 return NamenodeWebHdfsMethods.getRemoteIp(); 6805 } 6806 6807 // optimize ugi lookup for RPC operations to avoid a trip through 6808 // UGI.getCurrentUser which is synch'ed 6809 private static UserGroupInformation getRemoteUser() throws IOException { 6810 return NameNode.getRemoteUser(); 6811 } 6812 6813 /** 6814 * Log fsck event in the audit log 6815 */ 6816 void logFsckEvent(String src, InetAddress remoteAddress) throws IOException { 6817 if (isAuditEnabled()) { 6818 logAuditEvent(true, getRemoteUser(), 6819 remoteAddress, 6820 "fsck", src, null, null); 6821 } 6822 } 6823 /** 6824 * Register NameNodeMXBean 6825 */ 6826 private void registerMXBean() { 6827 mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this); 6828 } 6829 6830 /** 6831 * Class representing Namenode information for JMX interfaces 6832 */ 6833 @Override // NameNodeMXBean 6834 public String getVersion() { 6835 return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision(); 6836 } 6837 6838 @Override // NameNodeMXBean 6839 public long getUsed() { 6840 return this.getCapacityUsed(); 6841 } 6842 6843 @Override // NameNodeMXBean 6844 public long getFree() { 6845 return this.getCapacityRemaining(); 6846 } 6847 6848 @Override // NameNodeMXBean 6849 public long getTotal() { 6850 return this.getCapacityTotal(); 6851 } 6852 6853 @Override // NameNodeMXBean 6854 public String getSafemode() { 6855 if (!this.isInSafeMode()) 6856 return ""; 6857 return "Safe mode is ON. 
" + this.getSafeModeTip(); 6858 } 6859 6860 @Override // NameNodeMXBean 6861 public boolean isUpgradeFinalized() { 6862 return this.getFSImage().isUpgradeFinalized(); 6863 } 6864 6865 @Override // NameNodeMXBean 6866 public long getNonDfsUsedSpace() { 6867 return datanodeStatistics.getCapacityUsedNonDFS(); 6868 } 6869 6870 @Override // NameNodeMXBean 6871 public float getPercentUsed() { 6872 return datanodeStatistics.getCapacityUsedPercent(); 6873 } 6874 6875 @Override // NameNodeMXBean 6876 public long getBlockPoolUsedSpace() { 6877 return datanodeStatistics.getBlockPoolUsed(); 6878 } 6879 6880 @Override // NameNodeMXBean 6881 public float getPercentBlockPoolUsed() { 6882 return datanodeStatistics.getPercentBlockPoolUsed(); 6883 } 6884 6885 @Override // NameNodeMXBean 6886 public float getPercentRemaining() { 6887 return datanodeStatistics.getCapacityRemainingPercent(); 6888 } 6889 6890 @Override // NameNodeMXBean 6891 public long getCacheCapacity() { 6892 return datanodeStatistics.getCacheCapacity(); 6893 } 6894 6895 @Override // NameNodeMXBean 6896 public long getCacheUsed() { 6897 return datanodeStatistics.getCacheUsed(); 6898 } 6899 6900 @Override // NameNodeMXBean 6901 public long getTotalBlocks() { 6902 return getBlocksTotal(); 6903 } 6904 6905 @Override // NameNodeMXBean 6906 @Metric 6907 public long getTotalFiles() { 6908 return getFilesTotal(); 6909 } 6910 6911 @Override // NameNodeMXBean 6912 public long getNumberOfMissingBlocks() { 6913 return getMissingBlocksCount(); 6914 } 6915 6916 @Override // NameNodeMXBean 6917 public long getNumberOfMissingBlocksWithReplicationFactorOne() { 6918 return getMissingReplOneBlocksCount(); 6919 } 6920 6921 @Override // NameNodeMXBean 6922 public int getThreads() { 6923 return ManagementFactory.getThreadMXBean().getThreadCount(); 6924 } 6925 6926 /** 6927 * Returned information is a JSON representation of map with host name as the 6928 * key and value is a map of live node attribute keys to its values 6929 */ 6930 
@Override // NameNodeMXBean 6931 public String getLiveNodes() { 6932 final Map<String, Map<String,Object>> info = 6933 new HashMap<String, Map<String,Object>>(); 6934 final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); 6935 blockManager.getDatanodeManager().fetchDatanodes(live, null, false); 6936 for (DatanodeDescriptor node : live) { 6937 ImmutableMap.Builder<String, Object> innerinfo = 6938 ImmutableMap.<String,Object>builder(); 6939 innerinfo 6940 .put("infoAddr", node.getInfoAddr()) 6941 .put("infoSecureAddr", node.getInfoSecureAddr()) 6942 .put("xferaddr", node.getXferAddr()) 6943 .put("lastContact", getLastContact(node)) 6944 .put("usedSpace", getDfsUsed(node)) 6945 .put("adminState", node.getAdminState().toString()) 6946 .put("nonDfsUsedSpace", node.getNonDfsUsed()) 6947 .put("capacity", node.getCapacity()) 6948 .put("numBlocks", node.numBlocks()) 6949 .put("version", node.getSoftwareVersion()) 6950 .put("used", node.getDfsUsed()) 6951 .put("remaining", node.getRemaining()) 6952 .put("blockScheduled", node.getBlocksScheduled()) 6953 .put("blockPoolUsed", node.getBlockPoolUsed()) 6954 .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent()) 6955 .put("volfails", node.getVolumeFailures()); 6956 VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary(); 6957 if (volumeFailureSummary != null) { 6958 innerinfo 6959 .put("failedStorageLocations", 6960 volumeFailureSummary.getFailedStorageLocations()) 6961 .put("lastVolumeFailureDate", 6962 volumeFailureSummary.getLastVolumeFailureDate()) 6963 .put("estimatedCapacityLostTotal", 6964 volumeFailureSummary.getEstimatedCapacityLostTotal()); 6965 } 6966 info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build()); 6967 } 6968 return JSON.toString(info); 6969 } 6970 6971 /** 6972 * Returned information is a JSON representation of map with host name as the 6973 * key and value is a map of dead node attribute keys to its values 6974 */ 6975 @Override // 
NameNodeMXBean 6976 public String getDeadNodes() { 6977 final Map<String, Map<String, Object>> info = 6978 new HashMap<String, Map<String, Object>>(); 6979 final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>(); 6980 blockManager.getDatanodeManager().fetchDatanodes(null, dead, false); 6981 for (DatanodeDescriptor node : dead) { 6982 Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder() 6983 .put("lastContact", getLastContact(node)) 6984 .put("decommissioned", node.isDecommissioned()) 6985 .put("xferaddr", node.getXferAddr()) 6986 .build(); 6987 info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo); 6988 } 6989 return JSON.toString(info); 6990 } 6991 6992 /** 6993 * Returned information is a JSON representation of map with host name as the 6994 * key and value is a map of decommissioning node attribute keys to its 6995 * values 6996 */ 6997 @Override // NameNodeMXBean 6998 public String getDecomNodes() { 6999 final Map<String, Map<String, Object>> info = 7000 new HashMap<String, Map<String, Object>>(); 7001 final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager( 7002 ).getDecommissioningNodes(); 7003 for (DatanodeDescriptor node : decomNodeList) { 7004 Map<String, Object> innerinfo = ImmutableMap 7005 .<String, Object> builder() 7006 .put("xferaddr", node.getXferAddr()) 7007 .put("underReplicatedBlocks", 7008 node.decommissioningStatus.getUnderReplicatedBlocks()) 7009 .put("decommissionOnlyReplicas", 7010 node.decommissioningStatus.getDecommissionOnlyReplicas()) 7011 .put("underReplicateInOpenFiles", 7012 node.decommissioningStatus.getUnderReplicatedInOpenFiles()) 7013 .build(); 7014 info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo); 7015 } 7016 return JSON.toString(info); 7017 } 7018 7019 private long getLastContact(DatanodeDescriptor alivenode) { 7020 return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000; 7021 } 7022 7023 private long getDfsUsed(DatanodeDescriptor 
alivenode) { 7024 return alivenode.getDfsUsed(); 7025 } 7026 7027 @Override // NameNodeMXBean 7028 public String getClusterId() { 7029 return getFSImage().getStorage().getClusterID(); 7030 } 7031 7032 @Override // NameNodeMXBean 7033 public String getBlockPoolId() { 7034 return blockPoolId; 7035 } 7036 7037 @Override // NameNodeMXBean 7038 public String getNameDirStatuses() { 7039 Map<String, Map<File, StorageDirType>> statusMap = 7040 new HashMap<String, Map<File, StorageDirType>>(); 7041 7042 Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>(); 7043 for (Iterator<StorageDirectory> it 7044 = getFSImage().getStorage().dirIterator(); it.hasNext();) { 7045 StorageDirectory st = it.next(); 7046 activeDirs.put(st.getRoot(), st.getStorageDirType()); 7047 } 7048 statusMap.put("active", activeDirs); 7049 7050 List<Storage.StorageDirectory> removedStorageDirs 7051 = getFSImage().getStorage().getRemovedStorageDirs(); 7052 Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>(); 7053 for (StorageDirectory st : removedStorageDirs) { 7054 failedDirs.put(st.getRoot(), st.getStorageDirType()); 7055 } 7056 statusMap.put("failed", failedDirs); 7057 7058 return JSON.toString(statusMap); 7059 } 7060 7061 @Override // NameNodeMXBean 7062 public String getNodeUsage() { 7063 float median = 0; 7064 float max = 0; 7065 float min = 0; 7066 float dev = 0; 7067 7068 final Map<String, Map<String,Object>> info = 7069 new HashMap<String, Map<String,Object>>(); 7070 final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>(); 7071 blockManager.getDatanodeManager().fetchDatanodes(live, null, true); 7072 7073 if (live.size() > 0) { 7074 float totalDfsUsed = 0; 7075 float[] usages = new float[live.size()]; 7076 int i = 0; 7077 for (DatanodeDescriptor dn : live) { 7078 usages[i++] = dn.getDfsUsedPercent(); 7079 totalDfsUsed += dn.getDfsUsedPercent(); 7080 } 7081 totalDfsUsed /= live.size(); 7082 Arrays.sort(usages); 7083 median = 
usages[usages.length / 2]; 7084 max = usages[usages.length - 1]; 7085 min = usages[0]; 7086 7087 for (i = 0; i < usages.length; i++) { 7088 dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed); 7089 } 7090 dev = (float) Math.sqrt(dev / usages.length); 7091 } 7092 7093 final Map<String, Object> innerInfo = new HashMap<String, Object>(); 7094 innerInfo.put("min", StringUtils.format("%.2f%%", min)); 7095 innerInfo.put("median", StringUtils.format("%.2f%%", median)); 7096 innerInfo.put("max", StringUtils.format("%.2f%%", max)); 7097 innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev)); 7098 info.put("nodeUsage", innerInfo); 7099 7100 return JSON.toString(info); 7101 } 7102 7103 @Override // NameNodeMXBean 7104 public String getNameJournalStatus() { 7105 List<Map<String, String>> jasList = new ArrayList<Map<String, String>>(); 7106 FSEditLog log = getFSImage().getEditLog(); 7107 if (log != null) { 7108 // This flag can be false because we cannot hold a lock of FSEditLog 7109 // for metrics. 
7110 boolean openForWrite = log.isOpenForWriteWithoutLock(); 7111 for (JournalAndStream jas : log.getJournals()) { 7112 final Map<String, String> jasMap = new HashMap<String, String>(); 7113 String manager = jas.getManager().toString(); 7114 7115 jasMap.put("required", String.valueOf(jas.isRequired())); 7116 jasMap.put("disabled", String.valueOf(jas.isDisabled())); 7117 jasMap.put("manager", manager); 7118 7119 if (jas.isDisabled()) { 7120 jasMap.put("stream", "Failed"); 7121 } else if (openForWrite) { 7122 EditLogOutputStream elos = jas.getCurrentStream(); 7123 if (elos != null) { 7124 jasMap.put("stream", elos.generateReport()); 7125 } else { 7126 jasMap.put("stream", "not currently writing"); 7127 } 7128 } else { 7129 jasMap.put("stream", "open for read"); 7130 } 7131 jasList.add(jasMap); 7132 } 7133 } 7134 return JSON.toString(jasList); 7135 } 7136 7137 @Override // NameNodeMxBean 7138 public String getJournalTransactionInfo() { 7139 Map<String, String> txnIdMap = new HashMap<String, String>(); 7140 txnIdMap.put("LastAppliedOrWrittenTxId", 7141 Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId())); 7142 txnIdMap.put("MostRecentCheckpointTxId", 7143 Long.toString(this.getFSImage().getMostRecentCheckpointTxId())); 7144 return JSON.toString(txnIdMap); 7145 } 7146 7147 @Override // NameNodeMXBean 7148 public String getNNStarted() { 7149 return getStartTime().toString(); 7150 } 7151 7152 @Override // NameNodeMXBean 7153 public String getCompileInfo() { 7154 return VersionInfo.getDate() + " by " + VersionInfo.getUser() + 7155 " from " + VersionInfo.getBranch(); 7156 } 7157 7158 /** @return the block manager. */ 7159 public BlockManager getBlockManager() { 7160 return blockManager; 7161 } 7162 7163 public BlockIdManager getBlockIdManager() { 7164 return blockIdManager; 7165 } 7166 7167 /** @return the FSDirectory. */ 7168 @Override 7169 public FSDirectory getFSDirectory() { 7170 return dir; 7171 } 7172 /** Set the FSDirectory. 
*/ 7173 @VisibleForTesting 7174 public void setFSDirectory(FSDirectory dir) { 7175 this.dir = dir; 7176 } 7177 /** @return the cache manager. */ 7178 public CacheManager getCacheManager() { 7179 return cacheManager; 7180 } 7181 7182 @Override // NameNodeMXBean 7183 public String getCorruptFiles() { 7184 List<String> list = new ArrayList<String>(); 7185 Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks; 7186 try { 7187 corruptFileBlocks = listCorruptFileBlocks("/", null); 7188 int corruptFileCount = corruptFileBlocks.size(); 7189 if (corruptFileCount != 0) { 7190 for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) { 7191 list.add(c.toString()); 7192 } 7193 } 7194 } catch (IOException e) { 7195 LOG.warn("Get corrupt file blocks returned error: " + e.getMessage()); 7196 } 7197 return JSON.toString(list); 7198 } 7199 7200 @Override //NameNodeMXBean 7201 public int getDistinctVersionCount() { 7202 return blockManager.getDatanodeManager().getDatanodesSoftwareVersions() 7203 .size(); 7204 } 7205 7206 @Override //NameNodeMXBean 7207 public Map<String, Integer> getDistinctVersions() { 7208 return blockManager.getDatanodeManager().getDatanodesSoftwareVersions(); 7209 } 7210 7211 @Override //NameNodeMXBean 7212 public String getSoftwareVersion() { 7213 return VersionInfo.getVersion(); 7214 } 7215 7216 /** 7217 * Verifies that the given identifier and password are valid and match. 7218 * @param identifier Token identifier. 7219 * @param password Password in the token. 
7220 */ 7221 public synchronized void verifyToken(DelegationTokenIdentifier identifier, 7222 byte[] password) throws InvalidToken, RetriableException { 7223 try { 7224 getDelegationTokenSecretManager().verifyToken(identifier, password); 7225 } catch (InvalidToken it) { 7226 if (inTransitionToActive()) { 7227 throw new RetriableException(it); 7228 } 7229 throw it; 7230 } 7231 } 7232 7233 @Override 7234 public boolean isGenStampInFuture(Block block) { 7235 return blockIdManager.isGenStampInFuture(block); 7236 } 7237 7238 @VisibleForTesting 7239 public EditLogTailer getEditLogTailer() { 7240 return editLogTailer; 7241 } 7242 7243 @VisibleForTesting 7244 public void setEditLogTailerForTests(EditLogTailer tailer) { 7245 this.editLogTailer = tailer; 7246 } 7247 7248 @VisibleForTesting 7249 void setFsLockForTests(ReentrantReadWriteLock lock) { 7250 this.fsLock.coarseLock = lock; 7251 } 7252 7253 @VisibleForTesting 7254 public ReentrantReadWriteLock getFsLockForTests() { 7255 return fsLock.coarseLock; 7256 } 7257 7258 @VisibleForTesting 7259 public ReentrantLock getCpLockForTests() { 7260 return cpLock; 7261 } 7262 7263 @VisibleForTesting 7264 public SafeModeInfo getSafeModeInfoForTests() { 7265 return safeMode; 7266 } 7267 7268 @VisibleForTesting 7269 public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) { 7270 this.nnResourceChecker = nnResourceChecker; 7271 } 7272 7273 public SnapshotManager getSnapshotManager() { 7274 return snapshotManager; 7275 } 7276 7277 /** Allow snapshot on a directory. 
*/ 7278 void allowSnapshot(String path) throws IOException { 7279 checkOperation(OperationCategory.WRITE); 7280 final String operationName = "allowSnapshot"; 7281 boolean success = false; 7282 writeLock(); 7283 try { 7284 checkOperation(OperationCategory.WRITE); 7285 checkNameNodeSafeMode("Cannot allow snapshot for " + path); 7286 checkSuperuserPrivilege(); 7287 FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path); 7288 success = true; 7289 } finally { 7290 writeUnlock(operationName); 7291 } 7292 getEditLog().logSync(); 7293 logAuditEvent(success, operationName, path, null, null); 7294 } 7295 7296 /** Disallow snapshot on a directory. */ 7297 void disallowSnapshot(String path) throws IOException { 7298 checkOperation(OperationCategory.WRITE); 7299 final String operationName = "disallowSnapshot"; 7300 boolean success = false; 7301 writeLock(); 7302 try { 7303 checkOperation(OperationCategory.WRITE); 7304 checkNameNodeSafeMode("Cannot disallow snapshot for " + path); 7305 checkSuperuserPrivilege(); 7306 FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path); 7307 success = true; 7308 } finally { 7309 writeUnlock(operationName); 7310 } 7311 getEditLog().logSync(); 7312 logAuditEvent(success, operationName, path, null, null); 7313 } 7314 7315 /** 7316 * Create a snapshot 7317 * @param snapshotRoot The directory path where the snapshot is taken 7318 * @param snapshotName The name of the snapshot 7319 */ 7320 String createSnapshot(String snapshotRoot, String snapshotName, 7321 boolean logRetryCache) throws IOException { 7322 final String operationName = "createSnapshot"; 7323 String snapshotPath = null; 7324 writeLock(); 7325 try { 7326 checkOperation(OperationCategory.WRITE); 7327 checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot); 7328 snapshotPath = FSDirSnapshotOp.createSnapshot(dir, 7329 snapshotManager, snapshotRoot, snapshotName, logRetryCache); 7330 } finally { 7331 writeUnlock(operationName); 7332 } 7333 getEditLog().logSync(); 7334 
logAuditEvent(snapshotPath != null, operationName, snapshotRoot, 7335 snapshotPath, null); 7336 return snapshotPath; 7337 } 7338 7339 /** 7340 * Rename a snapshot 7341 * @param path The directory path where the snapshot was taken 7342 * @param snapshotOldName Old snapshot name 7343 * @param snapshotNewName New snapshot name 7344 * @throws SafeModeException 7345 * @throws IOException 7346 */ 7347 void renameSnapshot( 7348 String path, String snapshotOldName, String snapshotNewName, 7349 boolean logRetryCache) throws IOException { 7350 final String operationName = "renameSnapshot"; 7351 boolean success = false; 7352 writeLock(); 7353 try { 7354 checkOperation(OperationCategory.WRITE); 7355 checkNameNodeSafeMode("Cannot rename snapshot for " + path); 7356 FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path, 7357 snapshotOldName, snapshotNewName, logRetryCache); 7358 success = true; 7359 } finally { 7360 writeUnlock(operationName); 7361 } 7362 getEditLog().logSync(); 7363 String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName); 7364 String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName); 7365 logAuditEvent(success, operationName, oldSnapshotRoot, 7366 newSnapshotRoot, null); 7367 } 7368 7369 /** 7370 * Get the list of snapshottable directories that are owned 7371 * by the current user. Return all the snapshottable directories if the 7372 * current user is a super user. 
7373 * @return The list of all the current snapshottable directories 7374 * @throws IOException 7375 */ 7376 public SnapshottableDirectoryStatus[] getSnapshottableDirListing() 7377 throws IOException { 7378 final String operationName = "listSnapshottableDirectory"; 7379 SnapshottableDirectoryStatus[] status = null; 7380 checkOperation(OperationCategory.READ); 7381 boolean success = false; 7382 readLock(); 7383 try { 7384 checkOperation(OperationCategory.READ); 7385 status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager); 7386 success = true; 7387 } finally { 7388 readUnlock(operationName); 7389 } 7390 logAuditEvent(success, operationName, null, null, null); 7391 return status; 7392 } 7393 7394 /** 7395 * Get the difference between two snapshots (or between a snapshot and the 7396 * current status) of a snapshottable directory. 7397 * 7398 * @param path The full path of the snapshottable directory. 7399 * @param fromSnapshot Name of the snapshot to calculate the diff from. Null 7400 * or empty string indicates the current tree. 7401 * @param toSnapshot Name of the snapshot to calculated the diff to. Null or 7402 * empty string indicates the current tree. 7403 * @return A report about the difference between {@code fromSnapshot} and 7404 * {@code toSnapshot}. Modified/deleted/created/renamed files and 7405 * directories belonging to the snapshottable directories are listed 7406 * and labeled as M/-/+/R respectively. 
7407 * @throws IOException 7408 */ 7409 SnapshotDiffReport getSnapshotDiffReport(String path, 7410 String fromSnapshot, String toSnapshot) throws IOException { 7411 final String operationName = "computeSnapshotDiff"; 7412 SnapshotDiffReport diffs = null; 7413 checkOperation(OperationCategory.READ); 7414 readLock(); 7415 try { 7416 checkOperation(OperationCategory.READ); 7417 diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager, 7418 path, fromSnapshot, toSnapshot); 7419 } finally { 7420 readUnlock(operationName); 7421 } 7422 7423 logAuditEvent(diffs != null, operationName, null, null, null); 7424 return diffs; 7425 } 7426 7427 /** 7428 * Delete a snapshot of a snapshottable directory 7429 * @param snapshotRoot The snapshottable directory 7430 * @param snapshotName The name of the to-be-deleted snapshot 7431 * @throws SafeModeException 7432 * @throws IOException 7433 */ 7434 void deleteSnapshot(String snapshotRoot, String snapshotName, 7435 boolean logRetryCache) throws IOException { 7436 final String operationName = "deleteSnapshot"; 7437 boolean success = false; 7438 writeLock(); 7439 BlocksMapUpdateInfo blocksToBeDeleted = null; 7440 try { 7441 checkOperation(OperationCategory.WRITE); 7442 checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot); 7443 7444 blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager, 7445 snapshotRoot, snapshotName, logRetryCache); 7446 success = true; 7447 } finally { 7448 writeUnlock(operationName); 7449 } 7450 getEditLog().logSync(); 7451 7452 // Breaking the pattern as removing blocks have to happen outside of the 7453 // global lock 7454 if (blocksToBeDeleted != null) { 7455 removeBlocks(blocksToBeDeleted); 7456 } 7457 7458 String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName); 7459 logAuditEvent(success, operationName, rootPath, null, null); 7460 } 7461 7462 /** 7463 * Remove a list of INodeDirectorySnapshottable from the SnapshotManager 7464 * @param toRemove the list of 
INodeDirectorySnapshottable to be removed 7465 */ 7466 void removeSnapshottableDirs(List<INodeDirectory> toRemove) { 7467 if (snapshotManager != null) { 7468 snapshotManager.removeSnapshottable(toRemove); 7469 } 7470 } 7471 7472 RollingUpgradeInfo queryRollingUpgrade() throws IOException { 7473 checkSuperuserPrivilege(); 7474 checkOperation(OperationCategory.READ); 7475 readLock(); 7476 try { 7477 if (!isRollingUpgrade()) { 7478 return null; 7479 } 7480 Preconditions.checkNotNull(rollingUpgradeInfo); 7481 boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage(); 7482 rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage); 7483 return rollingUpgradeInfo; 7484 } finally { 7485 readUnlock("queryRollingUpgrade"); 7486 } 7487 } 7488 7489 RollingUpgradeInfo startRollingUpgrade() throws IOException { 7490 final String operationName = "startRollingUpgrade"; 7491 checkSuperuserPrivilege(); 7492 checkOperation(OperationCategory.WRITE); 7493 writeLock(); 7494 try { 7495 checkOperation(OperationCategory.WRITE); 7496 if (isRollingUpgrade()) { 7497 return rollingUpgradeInfo; 7498 } 7499 long startTime = now(); 7500 if (!haEnabled) { // for non-HA, we require NN to be in safemode 7501 startRollingUpgradeInternalForNonHA(startTime); 7502 } else { // for HA, NN cannot be in safemode 7503 checkNameNodeSafeMode("Failed to start rolling upgrade"); 7504 startRollingUpgradeInternal(startTime); 7505 } 7506 7507 getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime()); 7508 if (haEnabled) { 7509 // roll the edit log to make sure the standby NameNode can tail 7510 getFSImage().rollEditLog(); 7511 } 7512 } finally { 7513 writeUnlock(operationName); 7514 } 7515 7516 getEditLog().logSync(); 7517 if (auditLog.isInfoEnabled() && isExternalInvocation()) { 7518 logAuditEvent(true, operationName, null, null, null); 7519 } 7520 return rollingUpgradeInfo; 7521 } 7522 7523 /** 7524 * Update internal state to indicate that a rolling upgrade is in progress. 
7525 * @param startTime rolling upgrade start time 7526 */ 7527 void startRollingUpgradeInternal(long startTime) 7528 throws IOException { 7529 checkRollingUpgrade("start rolling upgrade"); 7530 getFSImage().checkUpgrade(); 7531 setRollingUpgradeInfo(false, startTime); 7532 } 7533 7534 /** 7535 * Update internal state to indicate that a rolling upgrade is in progress for 7536 * non-HA setup. This requires the namesystem is in SafeMode and after doing a 7537 * checkpoint for rollback the namesystem will quit the safemode automatically 7538 */ 7539 private void startRollingUpgradeInternalForNonHA(long startTime) 7540 throws IOException { 7541 Preconditions.checkState(!haEnabled); 7542 if (!isInSafeMode()) { 7543 throw new IOException("Safe mode should be turned ON " 7544 + "in order to create namespace image."); 7545 } 7546 checkRollingUpgrade("start rolling upgrade"); 7547 getFSImage().checkUpgrade(); 7548 // in non-HA setup, we do an extra checkpoint to generate a rollback image 7549 getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null); 7550 LOG.info("Successfully saved namespace for preparing rolling upgrade."); 7551 7552 // leave SafeMode automatically 7553 setSafeMode(SafeModeAction.SAFEMODE_LEAVE); 7554 setRollingUpgradeInfo(true, startTime); 7555 } 7556 7557 void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) { 7558 rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId, 7559 createdRollbackImages, startTime, 0L); 7560 } 7561 7562 public void setCreatedRollbackImages(boolean created) { 7563 if (rollingUpgradeInfo != null) { 7564 rollingUpgradeInfo.setCreatedRollbackImages(created); 7565 } 7566 } 7567 7568 public RollingUpgradeInfo getRollingUpgradeInfo() { 7569 return rollingUpgradeInfo; 7570 } 7571 7572 public boolean isNeedRollbackFsImage() { 7573 return needRollbackFsImage; 7574 } 7575 7576 public void setNeedRollbackFsImage(boolean needRollbackFsImage) { 7577 this.needRollbackFsImage = needRollbackFsImage; 7578 } 7579 
7580 @Override // NameNodeMXBean 7581 public RollingUpgradeInfo.Bean getRollingUpgradeStatus() { 7582 if (!isRollingUpgrade()) { 7583 return null; 7584 } 7585 RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo(); 7586 if (upgradeInfo.createdRollbackImages()) { 7587 return new RollingUpgradeInfo.Bean(upgradeInfo); 7588 } 7589 readLock(); 7590 try { 7591 // check again after acquiring the read lock. 7592 upgradeInfo = getRollingUpgradeInfo(); 7593 if (upgradeInfo == null) { 7594 return null; 7595 } 7596 if (!upgradeInfo.createdRollbackImages()) { 7597 boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage(); 7598 upgradeInfo.setCreatedRollbackImages(hasRollbackImage); 7599 } 7600 } catch (IOException ioe) { 7601 LOG.warn("Encountered exception setting Rollback Image", ioe); 7602 } finally { 7603 readUnlock("getRollingUpgradeStatus"); 7604 } 7605 return new RollingUpgradeInfo.Bean(upgradeInfo); 7606 } 7607 7608 /** Is rolling upgrade in progress? */ 7609 public boolean isRollingUpgrade() { 7610 return rollingUpgradeInfo != null && !rollingUpgradeInfo.isFinalized(); 7611 } 7612 7613 void checkRollingUpgrade(String action) throws RollingUpgradeException { 7614 if (isRollingUpgrade()) { 7615 throw new RollingUpgradeException("Failed to " + action 7616 + " since a rolling upgrade is already in progress." 
7617 + " Existing rolling upgrade info:\n" + rollingUpgradeInfo); 7618 } 7619 } 7620 7621 RollingUpgradeInfo finalizeRollingUpgrade() throws IOException { 7622 final String operationName = "finalizeRollingUpgrade"; 7623 checkSuperuserPrivilege(); 7624 checkOperation(OperationCategory.WRITE); 7625 writeLock(); 7626 try { 7627 checkOperation(OperationCategory.WRITE); 7628 if (!isRollingUpgrade()) { 7629 return null; 7630 } 7631 checkNameNodeSafeMode("Failed to finalize rolling upgrade"); 7632 7633 finalizeRollingUpgradeInternal(now()); 7634 getEditLog().logFinalizeRollingUpgrade(rollingUpgradeInfo.getFinalizeTime()); 7635 if (haEnabled) { 7636 // roll the edit log to make sure the standby NameNode can tail 7637 getFSImage().rollEditLog(); 7638 } 7639 getFSImage().updateStorageVersion(); 7640 getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK, 7641 NameNodeFile.IMAGE); 7642 } finally { 7643 writeUnlock(operationName); 7644 } 7645 7646 if (!haEnabled) { 7647 // Sync not needed for ha since the edit was rolled after logging. 
7648 getEditLog().logSync(); 7649 } 7650 7651 if (auditLog.isInfoEnabled() && isExternalInvocation()) { 7652 logAuditEvent(true, operationName, null, null, null); 7653 } 7654 return rollingUpgradeInfo; 7655 } 7656 7657 void finalizeRollingUpgradeInternal(long finalizeTime) { 7658 // Set the finalize time 7659 rollingUpgradeInfo.finalize(finalizeTime); 7660 } 7661 7662 long addCacheDirective(CacheDirectiveInfo directive, 7663 EnumSet<CacheFlag> flags, boolean logRetryCache) 7664 throws IOException { 7665 final String operationName = "addCacheDirective"; 7666 CacheDirectiveInfo effectiveDirective = null; 7667 if (!flags.contains(CacheFlag.FORCE)) { 7668 cacheManager.waitForRescanIfNeeded(); 7669 } 7670 writeLock(); 7671 try { 7672 checkOperation(OperationCategory.WRITE); 7673 if (isInSafeMode()) { 7674 throw new SafeModeException( 7675 "Cannot add cache directive", safeMode); 7676 } 7677 effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager, 7678 directive, flags, logRetryCache); 7679 } finally { 7680 writeUnlock(operationName); 7681 boolean success = effectiveDirective != null; 7682 if (success) { 7683 getEditLog().logSync(); 7684 } 7685 7686 String effectiveDirectiveStr = effectiveDirective != null ? 7687 effectiveDirective.toString() : null; 7688 logAuditEvent(success, operationName, effectiveDirectiveStr, 7689 null, null); 7690 } 7691 return effectiveDirective != null ? 
effectiveDirective.getId() : 0; 7692 } 7693 7694 void modifyCacheDirective(CacheDirectiveInfo directive, 7695 EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException { 7696 final String operationName = "modifyCacheDirective"; 7697 boolean success = false; 7698 if (!flags.contains(CacheFlag.FORCE)) { 7699 cacheManager.waitForRescanIfNeeded(); 7700 } 7701 writeLock(); 7702 try { 7703 checkOperation(OperationCategory.WRITE); 7704 if (isInSafeMode()) { 7705 throw new SafeModeException( 7706 "Cannot add cache directive", safeMode); 7707 } 7708 FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags, 7709 logRetryCache); 7710 success = true; 7711 } finally { 7712 writeUnlock(operationName); 7713 if (success) { 7714 getEditLog().logSync(); 7715 } 7716 String idStr = "{id: " + directive.getId().toString() + "}"; 7717 logAuditEvent(success, "modifyCacheDirective", idStr, 7718 directive.toString(), null); 7719 } 7720 } 7721 7722 void removeCacheDirective(long id, boolean logRetryCache) throws IOException { 7723 final String operationName = "removeCacheDirective"; 7724 boolean success = false; 7725 writeLock(); 7726 try { 7727 checkOperation(OperationCategory.WRITE); 7728 if (isInSafeMode()) { 7729 throw new SafeModeException( 7730 "Cannot remove cache directives", safeMode); 7731 } 7732 FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache); 7733 success = true; 7734 } finally { 7735 writeUnlock(operationName); 7736 String idStr = "{id: " + Long.toString(id) + "}"; 7737 logAuditEvent(success, operationName, idStr, null, 7738 null); 7739 } 7740 getEditLog().logSync(); 7741 } 7742 7743 BatchedListEntries<CacheDirectiveEntry> listCacheDirectives( 7744 long startId, CacheDirectiveInfo filter) throws IOException { 7745 final String operationName = "listCacheDirectives"; 7746 checkOperation(OperationCategory.READ); 7747 BatchedListEntries<CacheDirectiveEntry> results; 7748 cacheManager.waitForRescanIfNeeded(); 7749 readLock(); 7750 
boolean success = false; 7751 try { 7752 checkOperation(OperationCategory.READ); 7753 results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId, 7754 filter); 7755 success = true; 7756 } finally { 7757 readUnlock(operationName); 7758 logAuditEvent(success, operationName, filter.toString(), null, 7759 null); 7760 } 7761 return results; 7762 } 7763 7764 void addCachePool(CachePoolInfo req, boolean logRetryCache) 7765 throws IOException { 7766 final String operationName = "addCachePool"; 7767 writeLock(); 7768 boolean success = false; 7769 String poolInfoStr = null; 7770 try { 7771 checkOperation(OperationCategory.WRITE); 7772 if (isInSafeMode()) { 7773 throw new SafeModeException( 7774 "Cannot add cache pool " + req.getPoolName(), safeMode); 7775 } 7776 CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req, 7777 logRetryCache); 7778 poolInfoStr = info.toString(); 7779 success = true; 7780 } finally { 7781 writeUnlock(operationName); 7782 logAuditEvent(success, operationName, poolInfoStr, null, null); 7783 } 7784 7785 getEditLog().logSync(); 7786 } 7787 7788 void modifyCachePool(CachePoolInfo req, boolean logRetryCache) 7789 throws IOException { 7790 final String operationName = "modifyCachePool"; 7791 writeLock(); 7792 boolean success = false; 7793 try { 7794 checkOperation(OperationCategory.WRITE); 7795 if (isInSafeMode()) { 7796 throw new SafeModeException( 7797 "Cannot modify cache pool " + req.getPoolName(), safeMode); 7798 } 7799 FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache); 7800 success = true; 7801 } finally { 7802 writeUnlock(operationName); 7803 String poolNameStr = "{poolName: " + 7804 (req == null ? null : req.getPoolName()) + "}"; 7805 logAuditEvent(success, operationName, poolNameStr, 7806 req == null ? 
null : req.toString(), null); 7807 } 7808 7809 getEditLog().logSync(); 7810 } 7811 7812 void removeCachePool(String cachePoolName, boolean logRetryCache) 7813 throws IOException { 7814 final String operationName = "removeCachePool"; 7815 writeLock(); 7816 boolean success = false; 7817 try { 7818 checkOperation(OperationCategory.WRITE); 7819 if (isInSafeMode()) { 7820 throw new SafeModeException( 7821 "Cannot remove cache pool " + cachePoolName, safeMode); 7822 } 7823 FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName, 7824 logRetryCache); 7825 success = true; 7826 } finally { 7827 writeUnlock(operationName); 7828 String poolNameStr = "{poolName: " + cachePoolName + "}"; 7829 logAuditEvent(success, operationName, poolNameStr, null, null); 7830 } 7831 7832 getEditLog().logSync(); 7833 } 7834 7835 BatchedListEntries<CachePoolEntry> listCachePools(String prevKey) 7836 throws IOException { 7837 final String operationName = "listCachePools"; 7838 BatchedListEntries<CachePoolEntry> results; 7839 checkOperation(OperationCategory.READ); 7840 boolean success = false; 7841 cacheManager.waitForRescanIfNeeded(); 7842 readLock(); 7843 try { 7844 checkOperation(OperationCategory.READ); 7845 results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey); 7846 success = true; 7847 } finally { 7848 readUnlock(operationName); 7849 logAuditEvent(success, operationName, null, null, null); 7850 } 7851 return results; 7852 } 7853 7854 void modifyAclEntries(final String src, List<AclEntry> aclSpec) 7855 throws IOException { 7856 final String operationName = "modifyAclEntries"; 7857 HdfsFileStatus auditStat = null; 7858 checkOperation(OperationCategory.WRITE); 7859 writeLock(); 7860 try { 7861 checkOperation(OperationCategory.WRITE); 7862 checkNameNodeSafeMode("Cannot modify ACL entries on " + src); 7863 auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec); 7864 } catch (AccessControlException e) { 7865 logAuditEvent(false, operationName, src); 7866 throw e; 7867 } 
finally { 7868 writeUnlock(operationName); 7869 } 7870 getEditLog().logSync(); 7871 logAuditEvent(true, operationName, src, null, auditStat); 7872 } 7873 7874 void removeAclEntries(final String src, List<AclEntry> aclSpec) 7875 throws IOException { 7876 final String operationName = "removeAclEntries"; 7877 checkOperation(OperationCategory.WRITE); 7878 HdfsFileStatus auditStat = null; 7879 writeLock(); 7880 try { 7881 checkOperation(OperationCategory.WRITE); 7882 checkNameNodeSafeMode("Cannot remove ACL entries on " + src); 7883 auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec); 7884 } catch (AccessControlException e) { 7885 logAuditEvent(false, operationName, src); 7886 throw e; 7887 } finally { 7888 writeUnlock(operationName); 7889 } 7890 getEditLog().logSync(); 7891 logAuditEvent(true, operationName, src, null, auditStat); 7892 } 7893 7894 void removeDefaultAcl(final String src) throws IOException { 7895 final String operationName = "removeDefaultAcl"; 7896 HdfsFileStatus auditStat = null; 7897 checkOperation(OperationCategory.WRITE); 7898 writeLock(); 7899 try { 7900 checkOperation(OperationCategory.WRITE); 7901 checkNameNodeSafeMode("Cannot remove default ACL entries on " + src); 7902 auditStat = FSDirAclOp.removeDefaultAcl(dir, src); 7903 } catch (AccessControlException e) { 7904 logAuditEvent(false, operationName, src); 7905 throw e; 7906 } finally { 7907 writeUnlock(operationName); 7908 } 7909 getEditLog().logSync(); 7910 logAuditEvent(true, operationName, src, null, auditStat); 7911 } 7912 7913 void removeAcl(final String src) throws IOException { 7914 final String operationName = "removeAcl"; 7915 HdfsFileStatus auditStat = null; 7916 checkOperation(OperationCategory.WRITE); 7917 writeLock(); 7918 try { 7919 checkOperation(OperationCategory.WRITE); 7920 checkNameNodeSafeMode("Cannot remove ACL on " + src); 7921 auditStat = FSDirAclOp.removeAcl(dir, src); 7922 } catch (AccessControlException e) { 7923 logAuditEvent(false, operationName, src); 7924 
throw e; 7925 } finally { 7926 writeUnlock(operationName); 7927 } 7928 getEditLog().logSync(); 7929 logAuditEvent(true, operationName, src, null, auditStat); 7930 } 7931 7932 void setAcl(final String src, List<AclEntry> aclSpec) throws IOException { 7933 final String operationName = "setAcl"; 7934 HdfsFileStatus auditStat = null; 7935 checkOperation(OperationCategory.WRITE); 7936 writeLock(); 7937 try { 7938 checkOperation(OperationCategory.WRITE); 7939 checkNameNodeSafeMode("Cannot set ACL on " + src); 7940 auditStat = FSDirAclOp.setAcl(dir, src, aclSpec); 7941 } catch (AccessControlException e) { 7942 logAuditEvent(false, operationName, src); 7943 throw e; 7944 } finally { 7945 writeUnlock(operationName); 7946 } 7947 getEditLog().logSync(); 7948 logAuditEvent(true, operationName, src, null, auditStat); 7949 } 7950 7951 AclStatus getAclStatus(String src) throws IOException { 7952 final String operationName = "getAclStatus"; 7953 checkOperation(OperationCategory.READ); 7954 boolean success = false; 7955 readLock(); 7956 try { 7957 checkOperation(OperationCategory.READ); 7958 final AclStatus ret = FSDirAclOp.getAclStatus(dir, src); 7959 success = true; 7960 return ret; 7961 } finally { 7962 readUnlock(operationName); 7963 logAuditEvent(success, operationName, src); 7964 } 7965 } 7966 7967 /** 7968 * Create an encryption zone on directory src using the specified key. 7969 * 7970 * @param src the path of a directory which will be the root of the 7971 * encryption zone. The directory must be empty. 7972 * @param keyName name of a key which must be present in the configured 7973 * KeyProvider. 7974 * @throws AccessControlException if the caller is not the superuser. 7975 * @throws UnresolvedLinkException if the path can't be resolved. 7976 * @throws SafeModeException if the Namenode is in safe mode. 
7977 */ 7978 void createEncryptionZone(final String src, final String keyName, 7979 boolean logRetryCache) 7980 throws IOException, UnresolvedLinkException, 7981 SafeModeException, AccessControlException { 7982 try { 7983 if (provider == null) { 7984 throw new IOException( 7985 "Can't create an encryption zone for " + src + 7986 " since no key provider is available."); 7987 } 7988 if (keyName == null || keyName.isEmpty()) { 7989 throw new IOException("Must specify a key name when creating an " + 7990 "encryption zone"); 7991 } 7992 KeyProvider.Metadata metadata = provider.getMetadata(keyName); 7993 if (metadata == null) { 7994 /* 7995 * It would be nice if we threw something more specific than 7996 * IOException when the key is not found, but the KeyProvider API 7997 * doesn't provide for that. If that API is ever changed to throw 7998 * something more specific (e.g. UnknownKeyException) then we can 7999 * update this to match it, or better yet, just rethrow the 8000 * KeyProvider's exception. 
8001 */ 8002 throw new IOException("Key " + keyName + " doesn't exist."); 8003 } 8004 // If the provider supports pool for EDEKs, this will fill in the pool 8005 generateEncryptedDataEncryptionKey(keyName); 8006 createEncryptionZoneInt(src, metadata.getCipher(), 8007 keyName, logRetryCache); 8008 } catch (AccessControlException e) { 8009 logAuditEvent(false, "createEncryptionZone", src); 8010 throw e; 8011 } 8012 } 8013 8014 private void createEncryptionZoneInt(final String srcArg, String cipher, 8015 String keyName, final boolean logRetryCache) throws IOException { 8016 final String operationName = "createEncryptionZone"; 8017 String src = srcArg; 8018 HdfsFileStatus resultingStat = null; 8019 checkSuperuserPrivilege(); 8020 FSPermissionChecker pc = getPermissionChecker(); 8021 writeLock(); 8022 try { 8023 checkSuperuserPrivilege(); 8024 checkOperation(OperationCategory.WRITE); 8025 checkNameNodeSafeMode("Cannot create encryption zone on " + src); 8026 final INodesInPath iip = dir.resolvePathForWrite(pc, src); 8027 src = iip.getPath(); 8028 8029 final CipherSuite suite = CipherSuite.convert(cipher); 8030 // For now this is hardcoded, as we only support one method. 8031 final CryptoProtocolVersion version = 8032 CryptoProtocolVersion.ENCRYPTION_ZONES; 8033 final XAttr ezXAttr = dir.createEncryptionZone(src, suite, 8034 version, keyName); 8035 List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1); 8036 xAttrs.add(ezXAttr); 8037 getEditLog().logSetXAttrs(src, xAttrs, logRetryCache); 8038 resultingStat = dir.getAuditFileInfo(iip); 8039 } finally { 8040 writeUnlock(operationName); 8041 } 8042 getEditLog().logSync(); 8043 logAuditEvent(true, operationName, srcArg, null, resultingStat); 8044 } 8045 8046 /** 8047 * Get the encryption zone for the specified path. 8048 * 8049 * @param srcArg the path of a file or directory to get the EZ for. 8050 * @return the EZ of the of the path or null if none. 8051 * @throws AccessControlException if the caller is not the superuser. 
8052 * @throws UnresolvedLinkException if the path can't be resolved. 8053 */ 8054 EncryptionZone getEZForPath(final String srcArg) 8055 throws AccessControlException, UnresolvedLinkException, IOException { 8056 String src = srcArg; 8057 final String operationName = "getEZForPath"; 8058 HdfsFileStatus resultingStat = null; 8059 boolean success = false; 8060 final FSPermissionChecker pc = getPermissionChecker(); 8061 checkOperation(OperationCategory.READ); 8062 readLock(); 8063 try { 8064 checkOperation(OperationCategory.READ); 8065 INodesInPath iip = dir.resolvePath(pc, src); 8066 if (isPermissionEnabled) { 8067 dir.checkPathAccess(pc, iip, FsAction.READ); 8068 } 8069 final EncryptionZone ret = dir.getEZForPath(iip); 8070 resultingStat = dir.getAuditFileInfo(iip); 8071 success = true; 8072 return ret; 8073 } finally { 8074 readUnlock(operationName); 8075 logAuditEvent(success, operationName, srcArg, null, resultingStat); 8076 } 8077 } 8078 8079 BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId) 8080 throws IOException { 8081 final String operationName = "listEncryptionZones"; 8082 boolean success = false; 8083 checkSuperuserPrivilege(); 8084 checkOperation(OperationCategory.READ); 8085 readLock(); 8086 try { 8087 checkSuperuserPrivilege(); 8088 checkOperation(OperationCategory.READ); 8089 final BatchedListEntries<EncryptionZone> ret = 8090 dir.listEncryptionZones(prevId); 8091 success = true; 8092 return ret; 8093 } finally { 8094 readUnlock(operationName); 8095 logAuditEvent(success, operationName, null); 8096 } 8097 } 8098 8099 void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag, 8100 boolean logRetryCache) 8101 throws IOException { 8102 final String operationName = "setXAttr"; 8103 HdfsFileStatus auditStat = null; 8104 writeLock(); 8105 try { 8106 checkOperation(OperationCategory.WRITE); 8107 checkNameNodeSafeMode("Cannot set XAttr on " + src); 8108 auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache); 8109 } 
catch (AccessControlException e) { 8110 logAuditEvent(false, operationName, src); 8111 throw e; 8112 } finally { 8113 writeUnlock(operationName); 8114 } 8115 getEditLog().logSync(); 8116 logAuditEvent(true, operationName, src, null, auditStat); 8117 } 8118 8119 List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs) 8120 throws IOException { 8121 final String operationName = "getXAttrs"; 8122 checkOperation(OperationCategory.READ); 8123 readLock(); 8124 try { 8125 checkOperation(OperationCategory.READ); 8126 return FSDirXAttrOp.getXAttrs(dir, src, xAttrs); 8127 } catch (AccessControlException e) { 8128 logAuditEvent(false, operationName, src); 8129 throw e; 8130 } finally { 8131 readUnlock(operationName); 8132 } 8133 } 8134 8135 List<XAttr> listXAttrs(String src) throws IOException { 8136 final String operationName = "listXAttrs"; 8137 checkOperation(OperationCategory.READ); 8138 readLock(); 8139 try { 8140 checkOperation(OperationCategory.READ); 8141 return FSDirXAttrOp.listXAttrs(dir, src); 8142 } catch (AccessControlException e) { 8143 logAuditEvent(false, operationName, src); 8144 throw e; 8145 } finally { 8146 readUnlock(operationName); 8147 } 8148 } 8149 8150 void removeXAttr(String src, XAttr xAttr, boolean logRetryCache) 8151 throws IOException { 8152 final String operationName = "removeXAttr"; 8153 HdfsFileStatus auditStat = null; 8154 writeLock(); 8155 try { 8156 checkOperation(OperationCategory.WRITE); 8157 checkNameNodeSafeMode("Cannot remove XAttr entry on " + src); 8158 auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache); 8159 } catch (AccessControlException e) { 8160 logAuditEvent(false, operationName, src); 8161 throw e; 8162 } finally { 8163 writeUnlock(operationName); 8164 } 8165 getEditLog().logSync(); 8166 logAuditEvent(true, operationName, src, null, auditStat); 8167 } 8168 8169 void checkAccess(String src, FsAction mode) throws IOException { 8170 final String operationName = "checkAccess"; 8171 
checkOperation(OperationCategory.READ); 8172 FSPermissionChecker pc = getPermissionChecker(); 8173 readLock(); 8174 try { 8175 checkOperation(OperationCategory.READ); 8176 final INodesInPath iip = dir.resolvePath(pc, src); 8177 src = iip.getPath(); 8178 INode inode = iip.getLastINode(); 8179 if (inode == null) { 8180 throw new FileNotFoundException("Path not found"); 8181 } 8182 if (isPermissionEnabled) { 8183 dir.checkPathAccess(pc, iip, mode); 8184 } 8185 } catch (AccessControlException e) { 8186 logAuditEvent(false, operationName, src); 8187 throw e; 8188 } finally { 8189 readUnlock(operationName); 8190 } 8191 } 8192 8193 /** 8194 * Default AuditLogger implementation; used when no access logger is 8195 * defined in the config file. It can also be explicitly listed in the 8196 * config file. 8197 */ 8198 private static class DefaultAuditLogger extends HdfsAuditLogger { 8199 8200 private boolean logTokenTrackingId; 8201 8202 @Override 8203 public void initialize(Configuration conf) { 8204 logTokenTrackingId = conf.getBoolean( 8205 DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY, 8206 DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT); 8207 } 8208 8209 @Override 8210 public void logAuditEvent(boolean succeeded, String userName, 8211 InetAddress addr, String cmd, String src, String dst, 8212 FileStatus status, UserGroupInformation ugi, 8213 DelegationTokenSecretManager dtSecretManager) { 8214 if (auditLog.isInfoEnabled()) { 8215 final StringBuilder sb = auditBuffer.get(); 8216 sb.setLength(0); 8217 sb.append("allowed=").append(succeeded).append("\t"); 8218 sb.append("ugi=").append(userName).append("\t"); 8219 sb.append("ip=").append(addr).append("\t"); 8220 sb.append("cmd=").append(cmd).append("\t"); 8221 sb.append("src=").append(src).append("\t"); 8222 sb.append("dst=").append(dst).append("\t"); 8223 if (null == status) { 8224 sb.append("perm=null"); 8225 } else { 8226 sb.append("perm="); 8227 sb.append(status.getOwner()).append(":"); 8228 
sb.append(status.getGroup()).append(":"); 8229 sb.append(status.getPermission()); 8230 } 8231 if (logTokenTrackingId) { 8232 sb.append("\t").append("trackingId="); 8233 String trackingId = null; 8234 if (ugi != null && dtSecretManager != null 8235 && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) { 8236 for (TokenIdentifier tid: ugi.getTokenIdentifiers()) { 8237 if (tid instanceof DelegationTokenIdentifier) { 8238 DelegationTokenIdentifier dtid = 8239 (DelegationTokenIdentifier)tid; 8240 trackingId = dtSecretManager.getTokenTrackingId(dtid); 8241 break; 8242 } 8243 } 8244 } 8245 sb.append(trackingId); 8246 } 8247 sb.append("\t").append("proto="); 8248 sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc"); 8249 logAuditMessage(sb.toString()); 8250 } 8251 } 8252 8253 public void logAuditMessage(String message) { 8254 auditLog.info(message); 8255 } 8256 } 8257 8258 private static void enableAsyncAuditLog() { 8259 if (!(auditLog instanceof Log4JLogger)) { 8260 LOG.warn("Log4j is required to enable async auditlog"); 8261 return; 8262 } 8263 Logger logger = ((Log4JLogger)auditLog).getLogger(); 8264 @SuppressWarnings("unchecked") 8265 List<Appender> appenders = Collections.list(logger.getAllAppenders()); 8266 // failsafe against trying to async it more than once 8267 if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) { 8268 AsyncAppender asyncAppender = new AsyncAppender(); 8269 // change logger to have an async appender containing all the 8270 // previously configured appenders 8271 for (Appender appender : appenders) { 8272 logger.removeAppender(appender); 8273 asyncAppender.addAppender(appender); 8274 } 8275 logger.addAppender(asyncAppender); 8276 } 8277 } 8278 8279} 8280