001/* 002 * ModeShape (http://www.modeshape.org) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.modeshape.common.text; 017 018import java.io.UnsupportedEncodingException; 019import java.text.CharacterIterator; 020import java.text.StringCharacterIterator; 021import java.util.BitSet; 022import org.modeshape.common.annotation.Immutable; 023 024/** 025 * An encoder useful for converting text to be used within a URL, as defined by Section 2.3 of <a 026 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. Note that this class does not encode a complete URL ( 027 * {@link java.net.URLEncoder} and {@link java.net.URLDecoder} should be used for such purposes). 028 */ 029@Immutable 030public class UrlEncoder implements TextEncoder, TextDecoder { 031 032 public static final char ESCAPE_CHARACTER = '%'; 033 034 /** 035 * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved. These include upper and 036 * lower case letters, decimal digits, and a limited set of punctuation marks and symbols. 037 * 038 * <pre> 039 * unreserved = alphanum | mark 040 * mark = "-" | "_" | "." | "!" | "˜" | "*" | "'" | "(" | ")" 041 * </pre> 042 * 043 * Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI 044 * is being used in a context that does not allow the unescaped character to appear. 045 */ 046 private static final BitSet RFC2396_UNRESERVED_CHARACTERS = new BitSet(256); 047 private static final BitSet RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS; 048 049 /** 050 * Lookup table which is used to determine, based on a hex char, how many bytes were needed in UTF-8 encoding to store that char 051 */ 052 private static final byte[] BYTES_PER_CHAR = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; 053 054 static { 055 RFC2396_UNRESERVED_CHARACTERS.set('a', 'z' + 1); 056 RFC2396_UNRESERVED_CHARACTERS.set('A', 'Z' + 1); 057 RFC2396_UNRESERVED_CHARACTERS.set('0', '9' + 1); 058 RFC2396_UNRESERVED_CHARACTERS.set('-'); 059 RFC2396_UNRESERVED_CHARACTERS.set('_'); 060 RFC2396_UNRESERVED_CHARACTERS.set('.'); 061 RFC2396_UNRESERVED_CHARACTERS.set('!'); 062 RFC2396_UNRESERVED_CHARACTERS.set('~'); 063 RFC2396_UNRESERVED_CHARACTERS.set('*'); 064 RFC2396_UNRESERVED_CHARACTERS.set('\''); 065 RFC2396_UNRESERVED_CHARACTERS.set('('); 066 RFC2396_UNRESERVED_CHARACTERS.set(')'); 067 068 RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS = (BitSet)RFC2396_UNRESERVED_CHARACTERS.clone(); 069 RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS.set('/'); 070 } 071 072 private boolean slashEncoded = true; 073 074 @Override 075 public String encode( String text ) { 076 if (text == null) return null; 077 if (text.length() == 0) return text; 078 return encode(text, isSlashEncoded() ? RFC2396_UNRESERVED_CHARACTERS : RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS); 079 } 080 081 protected String encode( String text, 082 BitSet safeChars ) { 083 final StringBuilder result = new StringBuilder(); 084 final CharacterIterator iter = new StringCharacterIterator(text); 085 for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { 086 if (safeChars.get(c)) { 087 // Safe character, so just pass through ... 088 result.append(c); 089 } else { 090 try { 091 // The character is not a safe character, and must be escaped in UTF-8 form (see http://tools.ietf.org/html/rfc3629) 092 byte[] utf8Bytes = Character.toString(c).getBytes("UTF-8"); 093 for (byte utf8Byte : utf8Bytes) { 094 result.append(ESCAPE_CHARACTER); 095 int high = (utf8Byte & 0xf0) >> 4; 096 int low = utf8Byte & 0x0f; 097 result.append(Integer.toHexString(high)); 098 result.append(Integer.toHexString(low)); 099 } 100 } catch (UnsupportedEncodingException e) { 101 //should never happen 102 throw new IllegalStateException(e); 103 } 104 } 105 } 106 return result.toString(); 107 } 108 109 @Override 110 public String decode( String encodedText ) { 111 if (encodedText == null) return null; 112 if (encodedText.length() == 0) return encodedText; 113 final StringBuilder result = new StringBuilder(); 114 final CharacterIterator iter = new StringCharacterIterator(encodedText); 115 byte[] escapedCharBytes = new byte[4]; 116 int byteIdx = 0; 117 int bytesPerChar = -1; 118 for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { 119 if (c == ESCAPE_CHARACTER) { 120 boolean foundEscapedCharacter = false; 121 // Found the first character in a potential escape sequence, so grab the next two characters ... 122 char hexChar1 = iter.next(); 123 char hexChar2 = hexChar1 != CharacterIterator.DONE ? iter.next() : CharacterIterator.DONE; 124 if (hexChar2 != CharacterIterator.DONE) { 125 // We found two more characters, but ensure they form a valid hexadecimal number ... 126 int hexNum1 = Character.digit(hexChar1, 16); 127 int hexNum2 = Character.digit(hexChar2, 16); 128 if (hexNum1 > -1 && hexNum2 > -1) { 129 foundEscapedCharacter = true; 130 //since we're dealing with UTF-8, we need to figure out how many bytes were used to encode the original 131 //character by reading the number of leading 1 bits from the 1st high order byte 132 if (bytesPerChar == -1) { 133 bytesPerChar = BYTES_PER_CHAR[hexNum1]; 134 } 135 //record the next byte into the array 136 escapedCharBytes[byteIdx++] = (byte) (hexNum1 * 16 + hexNum2); 137 if (byteIdx == bytesPerChar) { 138 //we've filled the buffer of bytes 139 try { 140 result.append(new String(escapedCharBytes, 0, bytesPerChar, "UTF-8")); 141 } catch (UnsupportedEncodingException e) { 142 //should never happen 143 throw new IllegalStateException(e); 144 } 145 byteIdx = 0; 146 bytesPerChar = -1; 147 } 148 } 149 } 150 if (!foundEscapedCharacter) { 151 result.append(c); 152 if (hexChar1 != CharacterIterator.DONE) result.append(hexChar1); 153 if (hexChar2 != CharacterIterator.DONE) result.append(hexChar2); 154 } 155 } else { 156 result.append(c); 157 } 158 } 159 return result.toString(); 160 } 161 162 /** 163 * @return slashEncoded 164 */ 165 public boolean isSlashEncoded() { 166 return this.slashEncoded; 167 } 168 169 /** 170 * @param slashEncoded Sets slashEncoded to the specified value. 171 * @return this object, for method chaining 172 */ 173 public UrlEncoder setSlashEncoded( boolean slashEncoded ) { 174 this.slashEncoded = slashEncoded; 175 return this; 176 } 177 178}