001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.blockmanagement;
019
020import java.util.*;
021
022import org.apache.hadoop.conf.Configuration;
023
024import org.apache.hadoop.fs.StorageType;
025import org.apache.hadoop.hdfs.DFSUtil;
026import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
027import org.apache.hadoop.net.NetworkTopology;
028import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
029import org.apache.hadoop.net.Node;
030import org.apache.hadoop.net.NodeBase;
031
/** The class is responsible for choosing the desired number of targets
 * for placing block replicas in an environment with a node-group layer.
 * The replica placement strategy is adjusted to:
 * If the writer is on a datanode, the 1st replica is placed on the local
 *     node (or local node-group), otherwise on a random datanode.
 * The 2nd replica is placed on a datanode that is on a different rack from
 *     the 1st replica node.
 * The 3rd replica is placed on a datanode which is on a different node-group
 *     but the same rack as the second replica node.
 */
042public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
043
044  protected BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
045      NetworkTopology clusterMap, DatanodeManager datanodeManager) {
046    initialize(conf, stats, clusterMap, host2datanodeMap);
047  }
048
049  protected BlockPlacementPolicyWithNodeGroup() {
050  }
051
052  public void initialize(Configuration conf,  FSClusterStats stats,
053          NetworkTopology clusterMap, 
054          Host2NodesMap host2datanodeMap) {
055    super.initialize(conf, stats, clusterMap, host2datanodeMap);
056  }
057
058  /** choose local node of localMachine as the target.
059   * if localMachine is not available, choose a node on the same nodegroup or 
060   * rack instead.
061   * @return the chosen node
062   */
063  @Override
064  protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
065      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
066      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
067      EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack)
068      throws NotEnoughReplicasException {
069    // if no local machine, randomly choose one node
070    if (localMachine == null)
071      return chooseRandom(NodeBase.ROOT, excludedNodes, 
072          blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
073
074    // otherwise try local machine first
075    if (localMachine instanceof DatanodeDescriptor) {
076      DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
077      if (excludedNodes.add(localMachine)) { // was not in the excluded list
078        for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
079            .entrySet().iterator(); iter.hasNext(); ) {
080          Map.Entry<StorageType, Integer> entry = iter.next();
081          for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
082              localDataNode.getStorageInfos())) {
083            StorageType type = entry.getKey();
084            if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
085                maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) {
086              int num = entry.getValue();
087              if (num == 1) {
088                iter.remove();
089              } else {
090                entry.setValue(num - 1);
091              }
092              return localStorage;
093            }
094          }
095        }
096      }
097    }
098
099    // try a node on local node group
100    DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
101        (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
102        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
103    if (chosenStorage != null) {
104      return chosenStorage;
105    }
106
107    if (!fallbackToLocalRack) {
108      return null;
109    }
110    // try a node on local rack
111    return chooseLocalRack(localMachine, excludedNodes, 
112        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
113  }
114
115  /** @return the node of the second replica */
116  private static DatanodeDescriptor secondNode(Node localMachine,
117      List<DatanodeStorageInfo> results) {
118    // find the second replica
119    for(DatanodeStorageInfo nextStorage : results) {
120      DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
121      if (nextNode != localMachine) {
122        return nextNode;
123      }
124    }
125    return null;
126  }
127
128  @Override
129  protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
130      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
131      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
132      EnumMap<StorageType, Integer> storageTypes) throws
133      NotEnoughReplicasException {
134    // no local machine, so choose a random machine
135    if (localMachine == null) {
136      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
137          maxNodesPerRack, results, avoidStaleNodes, storageTypes);
138    }
139
140    // choose one from the local rack, but off-nodegroup
141    try {
142      final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
143      return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
144          results, avoidStaleNodes, storageTypes);
145    } catch (NotEnoughReplicasException e1) {
146      // find the second replica
147      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
148      if (newLocal != null) {
149        try {
150          return chooseRandom(
151              clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
152              blocksize, maxNodesPerRack, results, avoidStaleNodes,
153              storageTypes);
154        } catch(NotEnoughReplicasException e2) {
155          //otherwise randomly choose one from the network
156          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
157              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
158        }
159      } else {
160        //otherwise randomly choose one from the network
161        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
162            maxNodesPerRack, results, avoidStaleNodes, storageTypes);
163      }
164    }
165  }
166
167  /**
168   * {@inheritDoc}
169   */
170  @Override
171  protected void chooseRemoteRack(int numOfReplicas,
172      DatanodeDescriptor localMachine, Set<Node> excludedNodes,
173      long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
174      boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
175      throws NotEnoughReplicasException {
176    int oldNumOfReplicas = results.size();
177
178    final String rackLocation = NetworkTopology.getFirstHalf(
179        localMachine.getNetworkLocation());
180    try {
181      // randomly choose from remote racks
182      chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
183          maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
184    } catch (NotEnoughReplicasException e) {
185      // fall back to the local rack
186      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
187          rackLocation, excludedNodes, blocksize,
188          maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
189    }
190  }
191
192  /* choose one node from the nodegroup that <i>localMachine</i> is on.
193   * if no such node is available, choose one node from the nodegroup where
194   * a second replica is on.
195   * if still no such node is available, choose a random node in the cluster.
196   * @return the chosen node
197   */
198  private DatanodeStorageInfo chooseLocalNodeGroup(
199      NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
200      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
201      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
202      EnumMap<StorageType, Integer> storageTypes) throws
203      NotEnoughReplicasException {
204    // no local machine, so choose a random machine
205    if (localMachine == null) {
206      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
207          maxNodesPerRack, results, avoidStaleNodes, storageTypes);
208    }
209
210    // choose one from the local node group
211    try {
212      return chooseRandom(
213          clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
214          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
215          storageTypes);
216    } catch (NotEnoughReplicasException e1) {
217      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
218      if (newLocal != null) {
219        try {
220          return chooseRandom(
221              clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
222              excludedNodes, blocksize, maxNodesPerRack, results,
223              avoidStaleNodes, storageTypes);
224        } catch(NotEnoughReplicasException e2) {
225          //otherwise randomly choose one from the network
226          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
227              maxNodesPerRack, results, avoidStaleNodes, storageTypes);
228        }
229      } else {
230        //otherwise randomly choose one from the network
231        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
232            maxNodesPerRack, results, avoidStaleNodes, storageTypes);
233      }
234    }
235  }
236
237  @Override
238  protected String getRack(final DatanodeInfo cur) {
239    String nodeGroupString = cur.getNetworkLocation();
240    return NetworkTopology.getFirstHalf(nodeGroupString);
241  }
242  
243  /**
244   * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
245   * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
246   * within the same nodegroup
247   * @return number of new excluded nodes
248   */
249  @Override
250  protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
251      Set<Node> excludedNodes) {
252    int countOfExcludedNodes = 0;
253    String nodeGroupScope = chosenNode.getNetworkLocation();
254    List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
255    for (Node leafNode : leafNodes) {
256      if (excludedNodes.add(leafNode)) {
257        // not a existing node in excludedNodes
258        countOfExcludedNodes++;
259      }
260    }
261    
262    countOfExcludedNodes += addDependentNodesToExcludedNodes(
263        chosenNode, excludedNodes);
264    return countOfExcludedNodes;
265  }
266  
267  /**
268   * Add all nodes from a dependent nodes list to excludedNodes.
269   * @return number of new excluded nodes
270   */
271  private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode,
272      Set<Node> excludedNodes) {
273    if (this.host2datanodeMap == null) {
274      return 0;
275    }
276    int countOfExcludedNodes = 0;
277    for(String hostname : chosenNode.getDependentHostNames()) {
278      DatanodeDescriptor node =
279          this.host2datanodeMap.getDataNodeByHostName(hostname);
280      if(node!=null) {
281        if (excludedNodes.add(node)) {
282          countOfExcludedNodes++;
283        }
284      } else {
285        LOG.warn("Not able to find datanode " + hostname
286            + " which has dependency with datanode "
287            + chosenNode.getHostName());
288      }
289    }
290    
291    return countOfExcludedNodes;
292  }
293
294  /**
295   * Pick up replica node set for deleting replica as over-replicated. 
296   * First set contains replica nodes on rack with more than one
297   * replica while second set contains remaining replica nodes.
298   * If first is not empty, divide first set into two subsets:
299   *   moreThanOne contains nodes on nodegroup with more than one replica
300   *   exactlyOne contains the remaining nodes in first set
301   * then pickup priSet if not empty.
302   * If first is empty, then pick second.
303   */
304  @Override
305  public Collection<DatanodeStorageInfo> pickupReplicaSet(
306      Collection<DatanodeStorageInfo> first,
307      Collection<DatanodeStorageInfo> second,
308      Map<String, List<DatanodeStorageInfo>> rackMap) {
309    // If no replica within same rack, return directly.
310    if (first.isEmpty()) {
311      return second;
312    }
313    // Split data nodes in the first set into two sets, 
314    // moreThanOne contains nodes on nodegroup with more than one replica
315    // exactlyOne contains the remaining nodes
316    Map<String, List<DatanodeStorageInfo>> nodeGroupMap = 
317        new HashMap<String, List<DatanodeStorageInfo>>();
318    
319    for(DatanodeStorageInfo storage : first) {
320      final String nodeGroupName = NetworkTopology.getLastHalf(
321          storage.getDatanodeDescriptor().getNetworkLocation());
322      List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName);
323      if (storageList == null) {
324        storageList = new ArrayList<DatanodeStorageInfo>();
325        nodeGroupMap.put(nodeGroupName, storageList);
326      }
327      storageList.add(storage);
328    }
329    
330    final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>();
331    final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>();
332    // split nodes into two sets
333    for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) {
334      if (datanodeList.size() == 1 ) {
335        // exactlyOne contains nodes on nodegroup with exactly one replica
336        exactlyOne.add(datanodeList.get(0));
337      } else {
338        // moreThanOne contains nodes on nodegroup with more than one replica
339        moreThanOne.addAll(datanodeList);
340      }
341    }
342    
343    return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
344  }
345  
346}