001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.blockmanagement; 019 020import java.util.*; 021 022import org.apache.hadoop.conf.Configuration; 023 024import org.apache.hadoop.fs.StorageType; 025import org.apache.hadoop.hdfs.DFSUtil; 026import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 027import org.apache.hadoop.net.NetworkTopology; 028import org.apache.hadoop.net.NetworkTopologyWithNodeGroup; 029import org.apache.hadoop.net.Node; 030import org.apache.hadoop.net.NodeBase; 031 032/** The class is responsible for choosing the desired number of targets 033 * for placing block replicas on environment with node-group layer. 034 * The replica placement strategy is adjusted to: 035 * If the writer is on a datanode, the 1st replica is placed on the local 036 * node (or local node-group), otherwise a random datanode. 037 * The 2nd replica is placed on a datanode that is on a different rack with 1st 038 * replica node. 039 * The 3rd replica is placed on a datanode which is on a different node-group 040 * but the same rack as the second replica node. 041 */ 042public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault { 043 044 protected BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats, 045 NetworkTopology clusterMap, DatanodeManager datanodeManager) { 046 initialize(conf, stats, clusterMap, host2datanodeMap); 047 } 048 049 protected BlockPlacementPolicyWithNodeGroup() { 050 } 051 052 public void initialize(Configuration conf, FSClusterStats stats, 053 NetworkTopology clusterMap, 054 Host2NodesMap host2datanodeMap) { 055 super.initialize(conf, stats, clusterMap, host2datanodeMap); 056 } 057 058 /** choose local node of localMachine as the target. 059 * if localMachine is not available, choose a node on the same nodegroup or 060 * rack instead. 061 * @return the chosen node 062 */ 063 @Override 064 protected DatanodeStorageInfo chooseLocalStorage(Node localMachine, 065 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 066 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 067 EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack) 068 throws NotEnoughReplicasException { 069 // if no local machine, randomly choose one node 070 if (localMachine == null) 071 return chooseRandom(NodeBase.ROOT, excludedNodes, 072 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); 073 074 // otherwise try local machine first 075 if (localMachine instanceof DatanodeDescriptor) { 076 DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine; 077 if (excludedNodes.add(localMachine)) { // was not in the excluded list 078 for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes 079 .entrySet().iterator(); iter.hasNext(); ) { 080 Map.Entry<StorageType, Integer> entry = iter.next(); 081 for (DatanodeStorageInfo localStorage : DFSUtil.shuffle( 082 localDataNode.getStorageInfos())) { 083 StorageType type = entry.getKey(); 084 if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize, 085 maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) { 086 int num = entry.getValue(); 087 if (num == 1) { 088 iter.remove(); 089 } else { 090 entry.setValue(num - 1); 091 } 092 return localStorage; 093 } 094 } 095 } 096 } 097 } 098 099 // try a node on local node group 100 DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup( 101 (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 102 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); 103 if (chosenStorage != null) { 104 return chosenStorage; 105 } 106 107 if (!fallbackToLocalRack) { 108 return null; 109 } 110 // try a node on local rack 111 return chooseLocalRack(localMachine, excludedNodes, 112 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); 113 } 114 115 /** @return the node of the second replica */ 116 private static DatanodeDescriptor secondNode(Node localMachine, 117 List<DatanodeStorageInfo> results) { 118 // find the second replica 119 for(DatanodeStorageInfo nextStorage : results) { 120 DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor(); 121 if (nextNode != localMachine) { 122 return nextNode; 123 } 124 } 125 return null; 126 } 127 128 @Override 129 protected DatanodeStorageInfo chooseLocalRack(Node localMachine, 130 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 131 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 132 EnumMap<StorageType, Integer> storageTypes) throws 133 NotEnoughReplicasException { 134 // no local machine, so choose a random machine 135 if (localMachine == null) { 136 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 137 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 138 } 139 140 // choose one from the local rack, but off-nodegroup 141 try { 142 final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation()); 143 return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack, 144 results, avoidStaleNodes, storageTypes); 145 } catch (NotEnoughReplicasException e1) { 146 // find the second replica 147 final DatanodeDescriptor newLocal = secondNode(localMachine, results); 148 if (newLocal != null) { 149 try { 150 return chooseRandom( 151 clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes, 152 blocksize, maxNodesPerRack, results, avoidStaleNodes, 153 storageTypes); 154 } catch(NotEnoughReplicasException e2) { 155 //otherwise randomly choose one from the network 156 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 157 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 158 } 159 } else { 160 //otherwise randomly choose one from the network 161 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 162 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 163 } 164 } 165 } 166 167 /** 168 * {@inheritDoc} 169 */ 170 @Override 171 protected void chooseRemoteRack(int numOfReplicas, 172 DatanodeDescriptor localMachine, Set<Node> excludedNodes, 173 long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results, 174 boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes) 175 throws NotEnoughReplicasException { 176 int oldNumOfReplicas = results.size(); 177 178 final String rackLocation = NetworkTopology.getFirstHalf( 179 localMachine.getNetworkLocation()); 180 try { 181 // randomly choose from remote racks 182 chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize, 183 maxReplicasPerRack, results, avoidStaleNodes, storageTypes); 184 } catch (NotEnoughReplicasException e) { 185 // fall back to the local rack 186 chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas), 187 rackLocation, excludedNodes, blocksize, 188 maxReplicasPerRack, results, avoidStaleNodes, storageTypes); 189 } 190 } 191 192 /* choose one node from the nodegroup that <i>localMachine</i> is on. 193 * if no such node is available, choose one node from the nodegroup where 194 * a second replica is on. 195 * if still no such node is available, choose a random node in the cluster. 196 * @return the chosen node 197 */ 198 private DatanodeStorageInfo chooseLocalNodeGroup( 199 NetworkTopologyWithNodeGroup clusterMap, Node localMachine, 200 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 201 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 202 EnumMap<StorageType, Integer> storageTypes) throws 203 NotEnoughReplicasException { 204 // no local machine, so choose a random machine 205 if (localMachine == null) { 206 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 207 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 208 } 209 210 // choose one from the local node group 211 try { 212 return chooseRandom( 213 clusterMap.getNodeGroup(localMachine.getNetworkLocation()), 214 excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, 215 storageTypes); 216 } catch (NotEnoughReplicasException e1) { 217 final DatanodeDescriptor newLocal = secondNode(localMachine, results); 218 if (newLocal != null) { 219 try { 220 return chooseRandom( 221 clusterMap.getNodeGroup(newLocal.getNetworkLocation()), 222 excludedNodes, blocksize, maxNodesPerRack, results, 223 avoidStaleNodes, storageTypes); 224 } catch(NotEnoughReplicasException e2) { 225 //otherwise randomly choose one from the network 226 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 227 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 228 } 229 } else { 230 //otherwise randomly choose one from the network 231 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 232 maxNodesPerRack, results, avoidStaleNodes, storageTypes); 233 } 234 } 235 } 236 237 @Override 238 protected String getRack(final DatanodeInfo cur) { 239 String nodeGroupString = cur.getNetworkLocation(); 240 return NetworkTopology.getFirstHalf(nodeGroupString); 241 } 242 243 /** 244 * Find other nodes in the same nodegroup of <i>localMachine</i> and add them 245 * into <i>excludeNodes</i> as replica should not be duplicated for nodes 246 * within the same nodegroup 247 * @return number of new excluded nodes 248 */ 249 @Override 250 protected int addToExcludedNodes(DatanodeDescriptor chosenNode, 251 Set<Node> excludedNodes) { 252 int countOfExcludedNodes = 0; 253 String nodeGroupScope = chosenNode.getNetworkLocation(); 254 List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope); 255 for (Node leafNode : leafNodes) { 256 if (excludedNodes.add(leafNode)) { 257 // not a existing node in excludedNodes 258 countOfExcludedNodes++; 259 } 260 } 261 262 countOfExcludedNodes += addDependentNodesToExcludedNodes( 263 chosenNode, excludedNodes); 264 return countOfExcludedNodes; 265 } 266 267 /** 268 * Add all nodes from a dependent nodes list to excludedNodes. 269 * @return number of new excluded nodes 270 */ 271 private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode, 272 Set<Node> excludedNodes) { 273 if (this.host2datanodeMap == null) { 274 return 0; 275 } 276 int countOfExcludedNodes = 0; 277 for(String hostname : chosenNode.getDependentHostNames()) { 278 DatanodeDescriptor node = 279 this.host2datanodeMap.getDataNodeByHostName(hostname); 280 if(node!=null) { 281 if (excludedNodes.add(node)) { 282 countOfExcludedNodes++; 283 } 284 } else { 285 LOG.warn("Not able to find datanode " + hostname 286 + " which has dependency with datanode " 287 + chosenNode.getHostName()); 288 } 289 } 290 291 return countOfExcludedNodes; 292 } 293 294 /** 295 * Pick up replica node set for deleting replica as over-replicated. 296 * First set contains replica nodes on rack with more than one 297 * replica while second set contains remaining replica nodes. 298 * If first is not empty, divide first set into two subsets: 299 * moreThanOne contains nodes on nodegroup with more than one replica 300 * exactlyOne contains the remaining nodes in first set 301 * then pickup priSet if not empty. 302 * If first is empty, then pick second. 303 */ 304 @Override 305 public Collection<DatanodeStorageInfo> pickupReplicaSet( 306 Collection<DatanodeStorageInfo> first, 307 Collection<DatanodeStorageInfo> second, 308 Map<String, List<DatanodeStorageInfo>> rackMap) { 309 // If no replica within same rack, return directly. 310 if (first.isEmpty()) { 311 return second; 312 } 313 // Split data nodes in the first set into two sets, 314 // moreThanOne contains nodes on nodegroup with more than one replica 315 // exactlyOne contains the remaining nodes 316 Map<String, List<DatanodeStorageInfo>> nodeGroupMap = 317 new HashMap<String, List<DatanodeStorageInfo>>(); 318 319 for(DatanodeStorageInfo storage : first) { 320 final String nodeGroupName = NetworkTopology.getLastHalf( 321 storage.getDatanodeDescriptor().getNetworkLocation()); 322 List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName); 323 if (storageList == null) { 324 storageList = new ArrayList<DatanodeStorageInfo>(); 325 nodeGroupMap.put(nodeGroupName, storageList); 326 } 327 storageList.add(storage); 328 } 329 330 final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>(); 331 final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>(); 332 // split nodes into two sets 333 for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) { 334 if (datanodeList.size() == 1 ) { 335 // exactlyOne contains nodes on nodegroup with exactly one replica 336 exactlyOne.add(datanodeList.get(0)); 337 } else { 338 // moreThanOne contains nodes on nodegroup with more than one replica 339 moreThanOne.addAll(datanodeList); 340 } 341 } 342 343 return moreThanOne.isEmpty()? exactlyOne : moreThanOne; 344 } 345 346}