001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.common.xml;
017
018import java.text.CharacterIterator;
019import java.text.StringCharacterIterator;
020import org.modeshape.common.annotation.Immutable;
021
022/**
023 * A utility class for determining the validity of various XML names, per the <a href="http://www.w3.org/TR/REC-xml/">XML 1.0
024 * Specification</a>.
025 */
026@Immutable
public class XmlCharacters {

    /**
     * The number of code points covered by the lookup table: the whole Basic Multilingual Plane (0x0000-0xFFFF).
     * Supplementary code points (0x10000 and above) are handled arithmetically by the individual methods.
     */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** The first supplementary code point; the lookup table ends just before it. */
    private static final int FIRST_SUPPLEMENTARY_CODE_POINT = 0x10000;

    /** The last code point allowed by production [2] Char. */
    private static final int LAST_VALID_CODE_POINT = 0x10FFFF;

    /** The last code point allowed by production [4] NameStartChar (and therefore [4a] NameChar). */
    private static final int LAST_NAME_CODE_POINT = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each BMP character the XML classifications, as a set of bit flags. An
     * array is used because it is a fast way of looking up each character.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {

        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        final int validContentMask = VALID_CHARACTER | CONTENT_CHARACTER;
        mark(0x9, 0x9, validContentMask);
        mark(0xA, 0xA, validContentMask);
        mark(0xD, 0xD, validContentMask);
        mark(0x20, 0xD7FF, validContentMask);
        mark(0xE000, 0xFFFD, validContentMask);
        // The last range [#x10000-#x10FFFF] is bigger than our character array,
        // so it is handled in the 'isValid' and 'isValidContent' methods ...

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        MASKS['<'] &= ~(CONTENT_CHARACTER);
        MASKS['&'] &= ~(CONTENT_CHARACTER);
        MASKS['\n'] &= ~(CONTENT_CHARACTER);
        MASKS['\r'] &= ~(CONTENT_CHARACTER);
        MASKS[']'] &= ~(CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        mark(0x20, 0x20, SPACE_CHARACTER);
        mark(0x9, 0x9, SPACE_CHARACTER);
        mark(0xA, 0xA, SPACE_CHARACTER);
        mark(0xD, 0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters are also valid (non-first) characters for NAME and NCNAME
        final int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        mark(':', ':', nameStartMask);
        mark('_', '_', nameStartMask);
        mark('A', 'Z', nameStartMask);
        mark('a', 'z', nameStartMask);
        mark(0xC0, 0xD6, nameStartMask);
        mark(0xD8, 0xF6, nameStartMask);
        mark(0xF8, 0x2FF, nameStartMask);
        mark(0x370, 0x37D, nameStartMask);
        mark(0x37F, 0x1FFF, nameStartMask);
        mark(0x200C, 0x200D, nameStartMask);
        mark(0x2070, 0x218F, nameStartMask);
        mark(0x2C00, 0x2FEF, nameStartMask);
        mark(0x3001, 0xD7FF, nameStartMask);
        mark(0xF900, 0xFDCF, nameStartMask);
        mark(0xFDF0, 0xFFFD, nameStartMask);
        // The last range [#x10000-#xEFFFF] is bigger than our character array,
        // so it is handled in the name-related 'isValid...' methods ...

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character,
        // so they must only receive the (non-start) name mask.
        final int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        mark('-', '-', nameMask);
        mark('.', '.', nameMask);
        mark(0xB7, 0xB7, nameMask);
        mark('0', '9', nameMask);
        mark(0x0300, 0x036F, nameMask);
        mark(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        MASKS[':'] &= ~(NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        mark(0x20, 0x20, PUBID_CHARACTER);
        mark(0xA, 0xA, PUBID_CHARACTER);
        mark(0xD, 0xD, PUBID_CHARACTER);
        mark('A', 'Z', PUBID_CHARACTER);
        mark('a', 'z', PUBID_CHARACTER);
        mark('0', '9', PUBID_CHARACTER);
        final String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i < pubidPunctuation.length(); ++i) {
            MASKS[pubidPunctuation.charAt(i)] |= PUBID_CHARACTER;
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification flags to every table entry in the inclusive range.
     * 
     * @param first the first character in the range (inclusive)
     * @param last the last character in the range (inclusive); must be less than {@link #NUMBER_OF_CHARACTERS}
     * @param mask the classification flags to add
     */
    private static void mark( int first,
                              int last,
                              int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Determine whether the supplied code point lies within the lookup table and carries the supplied classification flag.
     * Values outside the table (negative or supplementary) never match.
     * 
     * @param c the code point
     * @param mask the classification flag to test
     * @return true if the code point is in the table and has the flag set
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the supplied code point is in the supplementary range <code>[#x10000-#xEFFFF]</code> that is allowed
     * anywhere in an XML Name or NCName.
     * 
     * @param c the code point
     * @return true if the code point is a supplementary name character
     */
    private static boolean isSupplementaryNameCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CODE_POINT <= c && c <= LAST_NAME_CODE_POINT;
    }

    /**
     * Determine whether the supplied code point is in the supplementary range <code>[#x10000-#x10FFFF]</code> that is allowed
     * in XML documents.
     * 
     * @param c the code point
     * @return true if the code point is a valid supplementary XML character
     */
    private static boolean isSupplementaryCharacter( int c ) {
        return FIRST_SUPPLEMENTARY_CODE_POINT <= c && c <= LAST_VALID_CODE_POINT;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML name is
     * more restrictive than the {@link #isValidName(int) remaining characters}.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasMask(c, NAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML NCName
     * is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasMask(c, NCNAME_START_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The {@link #isValidNameStart(int)
     * first character} in an XML name is more restrictive than the remaining characters.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasMask(c, NAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasMask(c, NCNAME_CHARACTER) || isSupplementaryNameCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This excludes
     * <code>'&lt;'</code>, <code>'&amp;'</code>, <code>'\n'</code>, <code>'\r'</code> and <code>']'</code> from the
     * {@link #isValid(int) valid XML characters}.
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementaryCharacter(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     * 
     * @param c the character (a Unicode code point); negative or out-of-range values are simply not valid
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name. The string is examined one Unicode code point at a time, so
     * supplementary characters encoded as surrogate pairs are recognized, while unpaired surrogates are rejected.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null or empty)
     */
    public static boolean isValidName( String name ) {
        if (name == null || name.length() == 0) return false;
        int codePoint = name.codePointAt(0);
        if (!isValidNameStart(codePoint)) return false;
        // Walk the rest of the string by index rather than with a sentinel-terminated iterator, so that no
        // character value (e.g. U+FFFF) can be mistaken for the end of the input.
        int index = Character.charCount(codePoint);
        final int length = name.length();
        while (index < length) {
            codePoint = name.codePointAt(index);
            if (!isValidName(codePoint)) return false;
            index += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Determine if the supplied name is a valid XML NCName. The string is examined one Unicode code point at a time, so
     * supplementary characters encoded as surrogate pairs are recognized, while unpaired surrogates are rejected.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is null or
     *         empty)
     */
    public static boolean isValidNcName( String name ) {
        if (name == null || name.length() == 0) return false;
        int codePoint = name.codePointAt(0);
        if (!isValidNcNameStart(codePoint)) return false;
        // Walk the rest of the string by index rather than with a sentinel-terminated iterator, so that no
        // character value (e.g. U+FFFF) can be mistaken for the end of the input.
        int index = Character.charCount(codePoint);
        final int length = name.length();
        while (index < length) {
            codePoint = name.codePointAt(index);
            if (!isValidNcName(codePoint)) return false;
            index += Character.charCount(codePoint);
        }
        return true;
    }
}