/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
018package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY;
024
025import java.io.IOException;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Random;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.conf.Configurable;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
035
036/**
037 * A DN volume choosing policy which takes into account the amount of free
038 * space on each of the available volumes when considering where to assign a
039 * new replica allocation. By default this policy prefers assigning replicas to
040 * those volumes with more available free space, so as to over time balance the
041 * available space of all the volumes within a DN.
042 */
043public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi>
044    implements VolumeChoosingPolicy<V>, Configurable {
045  
046  private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class);
047  
048  private final Random random;
049  
050  private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
051  private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
052
053  AvailableSpaceVolumeChoosingPolicy(Random random) {
054    this.random = random;
055  }
056
057  public AvailableSpaceVolumeChoosingPolicy() {
058    this(new Random());
059  }
060
061  @Override
062  public synchronized void setConf(Configuration conf) {
063    balancedSpaceThreshold = conf.getLong(
064        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,
065        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT);
066    balancedPreferencePercent = conf.getFloat(
067        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY,
068        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT);
069    
070    LOG.info("Available space volume choosing policy initialized: " +
071        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY +
072        " = " + balancedSpaceThreshold + ", " +
073        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
074        " = " + balancedPreferencePercent);
075
076    if (balancedPreferencePercent > 1.0) {
077      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
078               " is greater than 1.0 but should be in the range 0.0 - 1.0");
079    }
080
081    if (balancedPreferencePercent < 0.5) {
082      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
083               " is less than 0.5 so volumes with less available disk space will receive more block allocations");
084    }
085  }
086  
087  @Override
088  public synchronized Configuration getConf() {
089    // Nothing to do. Only added to fulfill the Configurable contract.
090    return null;
091  }
092  
093  private final VolumeChoosingPolicy<V> roundRobinPolicyBalanced =
094      new RoundRobinVolumeChoosingPolicy<V>();
095  private final VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable =
096      new RoundRobinVolumeChoosingPolicy<V>();
097  private final VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable =
098      new RoundRobinVolumeChoosingPolicy<V>();
099
100  @Override
101  public synchronized V chooseVolume(List<V> volumes,
102      long replicaSize) throws IOException {
103    if (volumes.size() < 1) {
104      throw new DiskOutOfSpaceException("No more available volumes");
105    }
106    
107    AvailableSpaceVolumeList volumesWithSpaces =
108        new AvailableSpaceVolumeList(volumes);
109    
110    if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {
111      // If they're actually not too far out of whack, fall back on pure round
112      // robin.
113      V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize);
114      if (LOG.isDebugEnabled()) {
115        LOG.debug("All volumes are within the configured free space balance " +
116            "threshold. Selecting " + volume + " for write of block size " +
117            replicaSize);
118      }
119      return volume;
120    } else {
121      V volume = null;
122      // If none of the volumes with low free space have enough space for the
123      // replica, always try to choose a volume with a lot of free space.
124      long mostAvailableAmongLowVolumes = volumesWithSpaces
125          .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace();
126      
127      List<V> highAvailableVolumes = extractVolumesFromPairs(
128          volumesWithSpaces.getVolumesWithHighAvailableSpace());
129      List<V> lowAvailableVolumes = extractVolumesFromPairs(
130          volumesWithSpaces.getVolumesWithLowAvailableSpace());
131      
132      float preferencePercentScaler =
133          (highAvailableVolumes.size() * balancedPreferencePercent) +
134          (lowAvailableVolumes.size() * (1 - balancedPreferencePercent));
135      float scaledPreferencePercent =
136          (highAvailableVolumes.size() * balancedPreferencePercent) /
137          preferencePercentScaler;
138      if (mostAvailableAmongLowVolumes < replicaSize ||
139          random.nextFloat() < scaledPreferencePercent) {
140        volume = roundRobinPolicyHighAvailable.chooseVolume(
141            highAvailableVolumes, replicaSize);
142        if (LOG.isDebugEnabled()) {
143          LOG.debug("Volumes are imbalanced. Selecting " + volume +
144              " from high available space volumes for write of block size "
145              + replicaSize);
146        }
147      } else {
148        volume = roundRobinPolicyLowAvailable.chooseVolume(
149            lowAvailableVolumes, replicaSize);
150        if (LOG.isDebugEnabled()) {
151          LOG.debug("Volumes are imbalanced. Selecting " + volume +
152              " from low available space volumes for write of block size "
153              + replicaSize);
154        }
155      }
156      return volume;
157    }
158  }
159  
160  /**
161   * Used to keep track of the list of volumes we're choosing from.
162   */
163  private class AvailableSpaceVolumeList {
164    private final List<AvailableSpaceVolumePair> volumes;
165    
166    public AvailableSpaceVolumeList(List<V> volumes) throws IOException {
167      this.volumes = new ArrayList<AvailableSpaceVolumePair>();
168      for (V volume : volumes) {
169        this.volumes.add(new AvailableSpaceVolumePair(volume));
170      }
171    }
172    
173    /**
174     * @return true if all volumes' free space is within the
175     *         configured threshold, false otherwise.
176     */
177    public boolean areAllVolumesWithinFreeSpaceThreshold() {
178      long leastAvailable = Long.MAX_VALUE;
179      long mostAvailable = 0;
180      for (AvailableSpaceVolumePair volume : volumes) {
181        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
182        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
183      }
184      return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
185    }
186    
187    /**
188     * @return the minimum amount of space available on a single volume,
189     *         across all volumes.
190     */
191    private long getLeastAvailableSpace() {
192      long leastAvailable = Long.MAX_VALUE;
193      for (AvailableSpaceVolumePair volume : volumes) {
194        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
195      }
196      return leastAvailable;
197    }
198    
199    /**
200     * @return the maximum amount of space available across volumes with low space.
201     */
202    public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() {
203      long mostAvailable = Long.MIN_VALUE;
204      for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) {
205        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
206      }
207      return mostAvailable;
208    }
209    
210    /**
211     * @return the list of volumes with relatively low available space.
212     */
213    public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() {
214      long leastAvailable = getLeastAvailableSpace();
215      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
216      for (AvailableSpaceVolumePair volume : volumes) {
217        if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) {
218          ret.add(volume);
219        }
220      }
221      return ret;
222    }
223    
224    /**
225     * @return the list of volumes with a lot of available space.
226     */
227    public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() {
228      long leastAvailable = getLeastAvailableSpace();
229      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
230      for (AvailableSpaceVolumePair volume : volumes) {
231        if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) {
232          ret.add(volume);
233        }
234      }
235      return ret;
236    }
237    
238  }
239  
240  /**
241   * Used so that we only check the available space on a given volume once, at
242   * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}.
243   */
244  private class AvailableSpaceVolumePair {
245    private final V volume;
246    private final long availableSpace;
247    
248    public AvailableSpaceVolumePair(V volume) throws IOException {
249      this.volume = volume;
250      this.availableSpace = volume.getAvailable();
251    }
252    
253    public long getAvailable() {
254      return availableSpace;
255    }
256    
257    public V getVolume() {
258      return volume;
259    }
260  }
261  
262  private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) {
263    List<V> ret = new ArrayList<V>();
264    for (AvailableSpaceVolumePair volume : volumes) {
265      ret.add(volume.getVolume());
266    }
267    return ret;
268  }
269
270}