001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.common.text;
017
018import java.io.UnsupportedEncodingException;
019import java.text.CharacterIterator;
020import java.text.StringCharacterIterator;
021import java.util.BitSet;
022import org.modeshape.common.annotation.Immutable;
023
024/**
025 * An encoder useful for converting text to be used within a URL, as defined by Section 2.3 of <a
026 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. Note that this class does not encode a complete URL (
027 * {@link java.net.URLEncoder} and {@link java.net.URLDecoder} should be used for such purposes).
028 */
029@Immutable
030public class UrlEncoder implements TextEncoder, TextDecoder {
031
032    public static final char ESCAPE_CHARACTER = '%';
033
034    /**
035     * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved. These include upper and
036     * lower case letters, decimal digits, and a limited set of punctuation marks and symbols.
037     * 
038     * <pre>
039     * unreserved  = alphanum | mark
040     * mark        = &quot;-&quot; | &quot;_&quot; | &quot;.&quot; | &quot;!&quot; | &quot;&tilde;&quot; | &quot;*&quot; | &quot;'&quot; | &quot;(&quot; | &quot;)&quot;
041     * </pre>
042     * 
043     * Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI
044     * is being used in a context that does not allow the unescaped character to appear.
045     */
046    private static final BitSet RFC2396_UNRESERVED_CHARACTERS = new BitSet(256);
047    private static final BitSet RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS;
048
049    /**
050     * Lookup table which is used to determine, based on a hex char, how many bytes were needed in UTF-8 encoding to store that char
051     */
052    private static final byte[] BYTES_PER_CHAR = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
053
054    static {
055        RFC2396_UNRESERVED_CHARACTERS.set('a', 'z' + 1);
056        RFC2396_UNRESERVED_CHARACTERS.set('A', 'Z' + 1);
057        RFC2396_UNRESERVED_CHARACTERS.set('0', '9' + 1);
058        RFC2396_UNRESERVED_CHARACTERS.set('-');
059        RFC2396_UNRESERVED_CHARACTERS.set('_');
060        RFC2396_UNRESERVED_CHARACTERS.set('.');
061        RFC2396_UNRESERVED_CHARACTERS.set('!');
062        RFC2396_UNRESERVED_CHARACTERS.set('~');
063        RFC2396_UNRESERVED_CHARACTERS.set('*');
064        RFC2396_UNRESERVED_CHARACTERS.set('\'');
065        RFC2396_UNRESERVED_CHARACTERS.set('(');
066        RFC2396_UNRESERVED_CHARACTERS.set(')');
067
068        RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS = (BitSet)RFC2396_UNRESERVED_CHARACTERS.clone();
069        RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS.set('/');
070    }
071
072    private boolean slashEncoded = true;
073
074    @Override
075    public String encode( String text ) {
076        if (text == null) return null;
077        if (text.length() == 0) return text;
078        return encode(text, isSlashEncoded() ? RFC2396_UNRESERVED_CHARACTERS : RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS);
079    }
080
081    protected String encode( String text,
082                             BitSet safeChars ) {
083        final StringBuilder result = new StringBuilder();
084        final CharacterIterator iter = new StringCharacterIterator(text);
085        for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
086            if (safeChars.get(c)) {
087                // Safe character, so just pass through ...
088                result.append(c);
089            } else {
090                try {
091                    // The character is not a safe character, and must be escaped in UTF-8 form (see http://tools.ietf.org/html/rfc3629)
092                    byte[] utf8Bytes = Character.toString(c).getBytes("UTF-8");
093                    for (byte utf8Byte : utf8Bytes) {
094                        result.append(ESCAPE_CHARACTER);
095                        int high = (utf8Byte & 0xf0) >> 4;
096                        int low = utf8Byte & 0x0f;
097                        result.append(Integer.toHexString(high));
098                        result.append(Integer.toHexString(low));
099                    }
100                } catch (UnsupportedEncodingException e) {
101                    //should never happen
102                    throw new IllegalStateException(e);
103                }
104            }
105        }
106        return result.toString();
107    }
108
109    @Override
110    public String decode( String encodedText ) {
111        if (encodedText == null) return null;
112        if (encodedText.length() == 0) return encodedText;
113        final StringBuilder result = new StringBuilder();
114        final CharacterIterator iter = new StringCharacterIterator(encodedText);
115        byte[] escapedCharBytes = new byte[4];
116        int byteIdx = 0;
117        int bytesPerChar = -1;
118        for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
119            if (c == ESCAPE_CHARACTER) {
120                boolean foundEscapedCharacter = false;
121                // Found the first character in a potential escape sequence, so grab the next two characters ...
122                char hexChar1 = iter.next();
123                char hexChar2 = hexChar1 != CharacterIterator.DONE ? iter.next() : CharacterIterator.DONE;
124                if (hexChar2 != CharacterIterator.DONE) {
125                    // We found two more characters, but ensure they form a valid hexadecimal number ...
126                    int hexNum1 = Character.digit(hexChar1, 16);
127                    int hexNum2 = Character.digit(hexChar2, 16);
128                    if (hexNum1 > -1 && hexNum2 > -1) {
129                        foundEscapedCharacter = true;
130                        //since we're dealing with UTF-8, we need to figure out how many bytes were used to encode the original
131                        //character by reading the number of leading 1 bits from the 1st high order byte
132                        if (bytesPerChar == -1) {
133                            bytesPerChar = BYTES_PER_CHAR[hexNum1];
134                        }
135                        //record the next byte into the array
136                        escapedCharBytes[byteIdx++] = (byte) (hexNum1 * 16 + hexNum2);
137                        if (byteIdx == bytesPerChar) {
138                            //we've filled the buffer of bytes
139                            try {
140                                result.append(new String(escapedCharBytes, 0, bytesPerChar, "UTF-8"));
141                            } catch (UnsupportedEncodingException e) {
142                                //should never happen
143                                throw new IllegalStateException(e);
144                            }
145                            byteIdx = 0;
146                            bytesPerChar = -1;
147                        }
148                    }
149                }
150                if (!foundEscapedCharacter) {
151                    result.append(c);
152                    if (hexChar1 != CharacterIterator.DONE) result.append(hexChar1);
153                    if (hexChar2 != CharacterIterator.DONE) result.append(hexChar2);
154                }
155            } else {
156                result.append(c);
157            }
158        }
159        return result.toString();
160    }
161
162    /**
163     * @return slashEncoded
164     */
165    public boolean isSlashEncoded() {
166        return this.slashEncoded;
167    }
168
169    /**
170     * @param slashEncoded Sets slashEncoded to the specified value.
171     * @return this object, for method chaining
172     */
173    public UrlEncoder setSlashEncoded( boolean slashEncoded ) {
174        this.slashEncoded = slashEncoded;
175        return this;
176    }
177
178}