/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.common.xml;

import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import org.modeshape.common.annotation.Immutable;

/**
 * A utility class for determining the validity of various XML names, per the <a href="http://www.w3.org/TR/REC-xml/">XML 1.0
 * Specification</a>.
 * <p>
 * All of the single-character methods take a Unicode code point. Code points in the Basic Multilingual Plane are classified
 * with a lookup table; supplementary code points (0x10000 and above) are classified with simple range checks. Values that are
 * not code points at all (negative, or greater than 0x10FFFF) are never valid.
 * </p>
 */
@Immutable
public class XmlCharacters {

    private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000

    /** The last code point allowed by the 'Char' production. */
    private static final int LAST_VALID_CHARACTER = 0x10FFFF;

    /** The last code point allowed by the 'NameStartChar' (and therefore 'NameChar') production. */
    private static final int LAST_NAME_CHARACTER = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each character the XML classifications. An array is used because it is
     * a fast way of looking up each character. Only the Basic Multilingual Plane is stored; the supplementary planes are
     * handled by range checks in the lookup methods.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {

        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        addMask(0x9, validMask);
        addMask(0xA, validMask);
        addMask(0xD, validMask);
        addMask(0x20, 0xD7FF, validMask);
        addMask(0xE000, 0xFFFD, validMask);
        // The last range is bigger than our character array, so it is handled in 'isValid' and 'isValidContent' ...

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        removeMask('<', CONTENT_CHARACTER);
        removeMask('&', CONTENT_CHARACTER);
        removeMask('\n', CONTENT_CHARACTER);
        removeMask('\r', CONTENT_CHARACTER);
        removeMask(']', CONTENT_CHARACTER);

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        addMask(0x20, SPACE_CHARACTER);
        addMask(0x9, SPACE_CHARACTER);
        addMask(0xA, SPACE_CHARACTER);
        addMask(0xD, SPACE_CHARACTER);

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters AND characters are valid for NAME and NCNAME
        int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        addMask(':', nameStartMask);
        addMask('_', nameStartMask);
        addMask('A', 'Z', nameStartMask);
        addMask('a', 'z', nameStartMask);
        addMask(0xC0, 0xD6, nameStartMask);
        addMask(0xD8, 0xF6, nameStartMask);
        addMask(0xF8, 0x2FF, nameStartMask);
        addMask(0x370, 0x37D, nameStartMask);
        addMask(0x37F, 0x1FFF, nameStartMask);
        addMask(0x200C, 0x200D, nameStartMask);
        addMask(0x2070, 0x218F, nameStartMask);
        addMask(0x2C00, 0x2FEF, nameStartMask);
        addMask(0x3001, 0xD7FF, nameStartMask);
        addMask(0xF900, 0xFDCF, nameStartMask);
        addMask(0xFDF0, 0xFFFD, nameStartMask);
        // The last range is bigger than our character array, so it is handled in 'hasNameMask' ...

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character
        int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        addMask('-', nameMask);
        addMask('.', nameMask);
        addMask(0xB7, nameMask);
        addMask('0', '9', nameMask);
        addMask(0x0300, 0x036F, nameMask);
        addMask(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        removeMask(':', NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        addMask(0x20, PUBID_CHARACTER);
        addMask(0xA, PUBID_CHARACTER);
        addMask(0xD, PUBID_CHARACTER);
        addMask('A', 'Z', PUBID_CHARACTER);
        addMask('a', 'z', PUBID_CHARACTER);
        addMask('0', '9', PUBID_CHARACTER);
        String pubidPunctuation = "-'()+,./:=?;!*#@$_%";
        for (int i = 0; i != pubidPunctuation.length(); ++i) {
            addMask(pubidPunctuation.charAt(i), PUBID_CHARACTER);
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification bits to a single character in the lookup table.
     *
     * @param c the character; must be within the Basic Multilingual Plane
     * @param mask the classification bits to add
     */
    private static void addMask( int c,
                                 int mask ) {
        MASKS[c] |= mask;
    }

    /**
     * Add the supplied classification bits to every character in an inclusive range of the lookup table.
     *
     * @param first the first character in the range; must be within the Basic Multilingual Plane
     * @param last the last character in the range (inclusive); must be within the Basic Multilingual Plane
     * @param mask the classification bits to add
     */
    private static void addMask( int first,
                                 int last,
                                 int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Clear the supplied classification bits from a single character in the lookup table.
     *
     * @param c the character; must be within the Basic Multilingual Plane
     * @param mask the classification bits to clear
     */
    private static void removeMask( int c,
                                    int mask ) {
        MASKS[c] &= ~mask;
    }

    /**
     * Determine whether the lookup table records any of the supplied classification bits for the code point. Code points
     * outside of the table (negative or supplementary) never match, so this method never throws.
     *
     * @param c the code point
     * @param mask the classification bits to look for
     * @return true if the code point is in the table and has at least one of the bits set
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the code point has the supplied name-related classification. Every supplementary code point in the range
     * [#x10000-#xEFFFF] is a NameStartChar, and thus valid for all four of the name-related classifications.
     *
     * @param c the code point
     * @param mask one of the name-related classification bits
     * @return true if the code point has the classification
     */
    private static boolean hasNameMask( int c,
                                        int mask ) {
        if (c >= NUMBER_OF_CHARACTERS) return c <= LAST_NAME_CHARACTER;
        return hasMask(c, mask);
    }

    /**
     * Determine whether the code point is in the supplementary range [#x10000-#x10FFFF] of the 'Char' production.
     *
     * @param c the code point
     * @return true if the code point is a valid supplementary character
     */
    private static boolean isSupplementary( int c ) {
        return c >= NUMBER_OF_CHARACTERS && c <= LAST_VALID_CHARACTER;
    }

    /**
     * Determine whether the first code point of the string has the 'start' classification and all remaining code points have the
     * 'part' classification.
     * <p>
     * The string is walked by code point with a plain index loop rather than with a {@link StringCharacterIterator}: the
     * iterator's end-of-text marker {@link CharacterIterator#DONE} is itself a legal <code>char</code> value (U+FFFF), so a loop
     * that stops on it would silently skip everything after a U+FFFF inside the string. Walking by code point also lets
     * surrogate pairs be classified as the single supplementary character they represent; an unpaired surrogate is never valid.
     * </p>
     *
     * @param name the string being checked; may be null
     * @param startMask the classification required of the first code point
     * @param partMask the classification required of all other code points
     * @return true if the string is non-empty and every code point has the required classification, or false otherwise
     */
    private static boolean isValidName( String name,
                                        int startMask,
                                        int partMask ) {
        if (name == null || name.length() == 0) return false;
        int c = name.codePointAt(0);
        if (!hasNameMask(c, startMask)) return false;
        for (int i = Character.charCount(c); i < name.length(); i += Character.charCount(c)) {
            c = name.codePointAt(i);
            if (!hasNameMask(c, partMask)) return false;
        }
        return true;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name. The first character in an XML name is
     * more restrictive than the {@link #isValidName(int) remaining characters}.
     *
     * @param c the character
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasNameMask(c, NAME_START_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName. The first character in an XML NCName
     * is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     *
     * @param c the character
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasNameMask(c, NCNAME_START_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name. The {@link #isValidNameStart(int)
     * first character} in an XML name is more restrictive than the remaining characters.
     *
     * @param c the character
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasNameMask(c, NAME_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName. The
     * {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     *
     * @param c the character
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasNameMask(c, NCNAME_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     *
     * @param c the character
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     *
     * @param c the character
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        return hasMask(c, VALID_CHARACTER) || isSupplementary(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This is any valid XML character other than
     * '&lt;', '&amp;', ']', line feed and carriage return.
     *
     * @param c the character
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || isSupplementary(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     *
     * @param c the character
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name.
     *
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is null or empty)
     */
    public static boolean isValidName( String name ) {
        return isValidName(name, NAME_START_CHARACTER, NAME_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML NCName.
     *
     * @param name the string being checked
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is null or
     *         empty)
     */
    public static boolean isValidNcName( String name ) {
        return isValidName(name, NCNAME_START_CHARACTER, NCNAME_CHARACTER);
    }
}