001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.common.text;
017
018import java.text.CharacterIterator;
019import java.text.StringCharacterIterator;
020import java.util.BitSet;
021import org.modeshape.common.annotation.Immutable;
022
023/**
024 * An {@link TextEncoder encoder} and {@link TextDecoder decoder} for XML element and attribute names.
025 * <p>
026 * Any UTF-16 unicode character that is not a valid XML name character according to the <a
027 * href="http://www.w3.org/TR/REC-xml/#sec-common-syn">World Wide Web Consortium (W3C) Extensible Markup Language (XML) 1.0
028 * (Fourth Edition) Recommendation</a> is escaped as <code>_xHHHH_</code>, where <code>HHHH</code> stands for the four-digit
029 * hexadecimal UTF-16 unicode value for the character in the most significant bit first order. For example, the name "Customer_ID"
030 * is encoded as "Customer_x0020_ID".
031 * </p>
032 * <p>
033 * Decoding transforms every <code>_xHHHH_</code> encoding sequences back into the UTF-16 character. Note that
034 * {@link #decode(String) decoding} can be safely done on any XML name, even if the name does not contain any encoded sequences.
035 * </p>
036 */
037@Immutable
038public class XmlNameEncoder implements TextDecoder, TextEncoder {
039
040    private static final BitSet XML_NAME_ALLOWED_CHARACTERS = new BitSet(2 ^ 16);
041
042    /**
043     * @see "<a href=http://www.w3.org/TR/xml/#NT-NameStartChar/>http://www.w3.org/TR/xml/#NT-NameStartChar</a>"
044     */
045    private static final BitSet XML_NAME_START_ALLOWED_CHARACTERS = new BitSet(2 ^ 16);
046
047    static {
048        // Initialize the unescaped bitset ...
049
050        // XML Names may contain: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
051        XML_NAME_ALLOWED_CHARACTERS.set('.');
052        XML_NAME_ALLOWED_CHARACTERS.set('-');
053        XML_NAME_ALLOWED_CHARACTERS.set('_');
054        XML_NAME_ALLOWED_CHARACTERS.set(':');
055
056        // XML Base Character Set
057        XML_NAME_ALLOWED_CHARACTERS.set('\u0041', '\u005A' + 1);
058        XML_NAME_ALLOWED_CHARACTERS.set('\u0061', '\u007A' + 1);
059        XML_NAME_ALLOWED_CHARACTERS.set('\u00C0', '\u00D6' + 1);
060        XML_NAME_ALLOWED_CHARACTERS.set('\u00D8', '\u00F6' + 1);
061        XML_NAME_ALLOWED_CHARACTERS.set('\u00F8', '\u00FF' + 1);
062        XML_NAME_ALLOWED_CHARACTERS.set('\u0100', '\u0131' + 1);
063        XML_NAME_ALLOWED_CHARACTERS.set('\u0134', '\u013E' + 1);
064        XML_NAME_ALLOWED_CHARACTERS.set('\u0141', '\u0148' + 1);
065        XML_NAME_ALLOWED_CHARACTERS.set('\u014A', '\u017E' + 1);
066        XML_NAME_ALLOWED_CHARACTERS.set('\u0180', '\u01C3' + 1);
067        XML_NAME_ALLOWED_CHARACTERS.set('\u01CD', '\u01F0' + 1);
068        XML_NAME_ALLOWED_CHARACTERS.set('\u01F4', '\u01F5' + 1);
069        XML_NAME_ALLOWED_CHARACTERS.set('\u01FA', '\u0217' + 1);
070        XML_NAME_ALLOWED_CHARACTERS.set('\u0250', '\u02A8' + 1);
071        XML_NAME_ALLOWED_CHARACTERS.set('\u02BB', '\u02C1' + 1);
072        XML_NAME_ALLOWED_CHARACTERS.set('\u0386');
073        XML_NAME_ALLOWED_CHARACTERS.set('\u0388', '\u038A' + 1);
074        XML_NAME_ALLOWED_CHARACTERS.set('\u038C');
075        XML_NAME_ALLOWED_CHARACTERS.set('\u038E', '\u03A1' + 1);
076        XML_NAME_ALLOWED_CHARACTERS.set('\u03A3', '\u03CE' + 1);
077        XML_NAME_ALLOWED_CHARACTERS.set('\u03D0', '\u03D6' + 1);
078        XML_NAME_ALLOWED_CHARACTERS.set('\u03DA');
079        XML_NAME_ALLOWED_CHARACTERS.set('\u03DC');
080        XML_NAME_ALLOWED_CHARACTERS.set('\u03DE');
081        XML_NAME_ALLOWED_CHARACTERS.set('\u03E0');
082        XML_NAME_ALLOWED_CHARACTERS.set('\u03E2', '\u03F3' + 1);
083        XML_NAME_ALLOWED_CHARACTERS.set('\u0401', '\u040C' + 1);
084        XML_NAME_ALLOWED_CHARACTERS.set('\u040E', '\u044F' + 1);
085        XML_NAME_ALLOWED_CHARACTERS.set('\u0451', '\u045C' + 1);
086        XML_NAME_ALLOWED_CHARACTERS.set('\u045E', '\u0481' + 1);
087        XML_NAME_ALLOWED_CHARACTERS.set('\u0490', '\u04C4' + 1);
088        XML_NAME_ALLOWED_CHARACTERS.set('\u04C7', '\u04C8' + 1);
089        XML_NAME_ALLOWED_CHARACTERS.set('\u04CB', '\u04CC' + 1);
090        XML_NAME_ALLOWED_CHARACTERS.set('\u04D0', '\u04EB' + 1);
091        XML_NAME_ALLOWED_CHARACTERS.set('\u04EE', '\u04F5' + 1);
092        XML_NAME_ALLOWED_CHARACTERS.set('\u04F8', '\u04F9' + 1);
093        XML_NAME_ALLOWED_CHARACTERS.set('\u0531', '\u0556' + 1);
094        XML_NAME_ALLOWED_CHARACTERS.set('\u0559');
095        XML_NAME_ALLOWED_CHARACTERS.set('\u0561', '\u0586' + 1);
096        XML_NAME_ALLOWED_CHARACTERS.set('\u05D0', '\u05EA' + 1);
097        XML_NAME_ALLOWED_CHARACTERS.set('\u05F0', '\u05F2' + 1);
098        XML_NAME_ALLOWED_CHARACTERS.set('\u0621', '\u063A' + 1);
099        XML_NAME_ALLOWED_CHARACTERS.set('\u0641', '\u064A' + 1);
100        XML_NAME_ALLOWED_CHARACTERS.set('\u0671', '\u06B7' + 1);
101        XML_NAME_ALLOWED_CHARACTERS.set('\u06BA', '\u06BE' + 1);
102        XML_NAME_ALLOWED_CHARACTERS.set('\u06C0', '\u06CE' + 1);
103        XML_NAME_ALLOWED_CHARACTERS.set('\u06D0', '\u06D3' + 1);
104        XML_NAME_ALLOWED_CHARACTERS.set('\u06D5');
105        XML_NAME_ALLOWED_CHARACTERS.set('\u06E5', '\u06E6' + 1);
106        XML_NAME_ALLOWED_CHARACTERS.set('\u0905', '\u0939' + 1);
107        XML_NAME_ALLOWED_CHARACTERS.set('\u093D');
108        XML_NAME_ALLOWED_CHARACTERS.set('\u0958', '\u0961' + 1);
109        XML_NAME_ALLOWED_CHARACTERS.set('\u0985', '\u098C' + 1);
110        XML_NAME_ALLOWED_CHARACTERS.set('\u098F', '\u0990' + 1);
111        XML_NAME_ALLOWED_CHARACTERS.set('\u0993', '\u09A8' + 1);
112        XML_NAME_ALLOWED_CHARACTERS.set('\u09AA', '\u09B0' + 1);
113        XML_NAME_ALLOWED_CHARACTERS.set('\u09B2');
114        XML_NAME_ALLOWED_CHARACTERS.set('\u09B6', '\u09B9' + 1);
115        XML_NAME_ALLOWED_CHARACTERS.set('\u09DC', '\u09DD' + 1);
116        XML_NAME_ALLOWED_CHARACTERS.set('\u09DF', '\u09E1' + 1);
117        XML_NAME_ALLOWED_CHARACTERS.set('\u09F0', '\u09F1' + 1);
118        XML_NAME_ALLOWED_CHARACTERS.set('\u0A05', '\u0A0A' + 1);
119        XML_NAME_ALLOWED_CHARACTERS.set('\u0A0F', '\u0A10' + 1);
120        XML_NAME_ALLOWED_CHARACTERS.set('\u0A13', '\u0A28' + 1);
121        XML_NAME_ALLOWED_CHARACTERS.set('\u0A2A', '\u0A30' + 1);
122        XML_NAME_ALLOWED_CHARACTERS.set('\u0A32', '\u0A33' + 1);
123        XML_NAME_ALLOWED_CHARACTERS.set('\u0A35', '\u0A36' + 1);
124        XML_NAME_ALLOWED_CHARACTERS.set('\u0A38', '\u0A39' + 1);
125        XML_NAME_ALLOWED_CHARACTERS.set('\u0A59', '\u0A5C' + 1);
126        XML_NAME_ALLOWED_CHARACTERS.set('\u0A5E');
127        XML_NAME_ALLOWED_CHARACTERS.set('\u0A72', '\u0A74' + 1);
128        XML_NAME_ALLOWED_CHARACTERS.set('\u0A85', '\u0A8B' + 1);
129        XML_NAME_ALLOWED_CHARACTERS.set('\u0A8D');
130        XML_NAME_ALLOWED_CHARACTERS.set('\u0A8F', '\u0A91' + 1);
131        XML_NAME_ALLOWED_CHARACTERS.set('\u0A93', '\u0AA8' + 1);
132        XML_NAME_ALLOWED_CHARACTERS.set('\u0AAA', '\u0AB0' + 1);
133        XML_NAME_ALLOWED_CHARACTERS.set('\u0AB2', '\u0AB3' + 1);
134        XML_NAME_ALLOWED_CHARACTERS.set('\u0AB5', '\u0AB9' + 1);
135        XML_NAME_ALLOWED_CHARACTERS.set('\u0ABD');
136        XML_NAME_ALLOWED_CHARACTERS.set('\u0AE0');
137        XML_NAME_ALLOWED_CHARACTERS.set('\u0B05', '\u0B0C' + 1);
138        XML_NAME_ALLOWED_CHARACTERS.set('\u0B0F', '\u0B10' + 1);
139        XML_NAME_ALLOWED_CHARACTERS.set('\u0B13', '\u0B28' + 1);
140        XML_NAME_ALLOWED_CHARACTERS.set('\u0B2A', '\u0B30' + 1);
141        XML_NAME_ALLOWED_CHARACTERS.set('\u0B32', '\u0B33' + 1);
142        XML_NAME_ALLOWED_CHARACTERS.set('\u0B36', '\u0B39' + 1);
143        XML_NAME_ALLOWED_CHARACTERS.set('\u0B3D');
144        XML_NAME_ALLOWED_CHARACTERS.set('\u0B5C', '\u0B5D' + 1);
145        XML_NAME_ALLOWED_CHARACTERS.set('\u0B5F', '\u0B61' + 1);
146        XML_NAME_ALLOWED_CHARACTERS.set('\u0B85', '\u0B8A' + 1);
147        XML_NAME_ALLOWED_CHARACTERS.set('\u0B8E', '\u0B90' + 1);
148        XML_NAME_ALLOWED_CHARACTERS.set('\u0B92', '\u0B95' + 1);
149        XML_NAME_ALLOWED_CHARACTERS.set('\u0B99', '\u0B9A' + 1);
150        XML_NAME_ALLOWED_CHARACTERS.set('\u0B9C');
151        XML_NAME_ALLOWED_CHARACTERS.set('\u0B9E', '\u0B9F' + 1);
152        XML_NAME_ALLOWED_CHARACTERS.set('\u0BA3', '\u0BA4' + 1);
153        XML_NAME_ALLOWED_CHARACTERS.set('\u0BA8', '\u0BAA' + 1);
154        XML_NAME_ALLOWED_CHARACTERS.set('\u0BAE', '\u0BB5' + 1);
155        XML_NAME_ALLOWED_CHARACTERS.set('\u0BB7', '\u0BB9' + 1);
156        XML_NAME_ALLOWED_CHARACTERS.set('\u0C05', '\u0C0C' + 1);
157        XML_NAME_ALLOWED_CHARACTERS.set('\u0C0E', '\u0C10' + 1);
158        XML_NAME_ALLOWED_CHARACTERS.set('\u0C12', '\u0C28' + 1);
159        XML_NAME_ALLOWED_CHARACTERS.set('\u0C2A', '\u0C33' + 1);
160        XML_NAME_ALLOWED_CHARACTERS.set('\u0C35', '\u0C39' + 1);
161        XML_NAME_ALLOWED_CHARACTERS.set('\u0C60', '\u0C61' + 1);
162        XML_NAME_ALLOWED_CHARACTERS.set('\u0C85', '\u0C8C' + 1);
163        XML_NAME_ALLOWED_CHARACTERS.set('\u0C8E', '\u0C90' + 1);
164        XML_NAME_ALLOWED_CHARACTERS.set('\u0C92', '\u0CA8' + 1);
165        XML_NAME_ALLOWED_CHARACTERS.set('\u0CAA', '\u0CB3' + 1);
166        XML_NAME_ALLOWED_CHARACTERS.set('\u0CB5', '\u0CB9' + 1);
167        XML_NAME_ALLOWED_CHARACTERS.set('\u0CDE');
168        XML_NAME_ALLOWED_CHARACTERS.set('\u0CE0', '\u0CE1' + 1);
169        XML_NAME_ALLOWED_CHARACTERS.set('\u0D05', '\u0D0C' + 1);
170        XML_NAME_ALLOWED_CHARACTERS.set('\u0D0E', '\u0D10' + 1);
171        XML_NAME_ALLOWED_CHARACTERS.set('\u0D12', '\u0D28' + 1);
172        XML_NAME_ALLOWED_CHARACTERS.set('\u0D2A', '\u0D39' + 1);
173        XML_NAME_ALLOWED_CHARACTERS.set('\u0D60', '\u0D61' + 1);
174        XML_NAME_ALLOWED_CHARACTERS.set('\u0E01', '\u0E2E' + 1);
175        XML_NAME_ALLOWED_CHARACTERS.set('\u0E30');
176        XML_NAME_ALLOWED_CHARACTERS.set('\u0E32', '\u0E33' + 1);
177        XML_NAME_ALLOWED_CHARACTERS.set('\u0E40', '\u0E45' + 1);
178        XML_NAME_ALLOWED_CHARACTERS.set('\u0E81', '\u0E82' + 1);
179        XML_NAME_ALLOWED_CHARACTERS.set('\u0E84');
180        XML_NAME_ALLOWED_CHARACTERS.set('\u0E87', '\u0E88' + 1);
181        XML_NAME_ALLOWED_CHARACTERS.set('\u0E8A');
182        XML_NAME_ALLOWED_CHARACTERS.set('\u0E8D');
183        XML_NAME_ALLOWED_CHARACTERS.set('\u0E94', '\u0E97' + 1);
184        XML_NAME_ALLOWED_CHARACTERS.set('\u0E99', '\u0E9F' + 1);
185        XML_NAME_ALLOWED_CHARACTERS.set('\u0EA1', '\u0EA3' + 1);
186        XML_NAME_ALLOWED_CHARACTERS.set('\u0EA5');
187        XML_NAME_ALLOWED_CHARACTERS.set('\u0EA7');
188        XML_NAME_ALLOWED_CHARACTERS.set('\u0EAA', '\u0EAB' + 1);
189        XML_NAME_ALLOWED_CHARACTERS.set('\u0EAD', '\u0EAE' + 1);
190        XML_NAME_ALLOWED_CHARACTERS.set('\u0EB0');
191        XML_NAME_ALLOWED_CHARACTERS.set('\u0EB2', '\u0EB3' + 1);
192        XML_NAME_ALLOWED_CHARACTERS.set('\u0EBD');
193        XML_NAME_ALLOWED_CHARACTERS.set('\u0EC0', '\u0EC4' + 1);
194        XML_NAME_ALLOWED_CHARACTERS.set('\u0F40', '\u0F47' + 1);
195        XML_NAME_ALLOWED_CHARACTERS.set('\u0F49', '\u0F69' + 1);
196        XML_NAME_ALLOWED_CHARACTERS.set('\u10A0', '\u10C5' + 1);
197        XML_NAME_ALLOWED_CHARACTERS.set('\u10D0', '\u10F6' + 1);
198        XML_NAME_ALLOWED_CHARACTERS.set('\u1100');
199        XML_NAME_ALLOWED_CHARACTERS.set('\u1102', '\u1103' + 1);
200        XML_NAME_ALLOWED_CHARACTERS.set('\u1105', '\u1107' + 1);
201        XML_NAME_ALLOWED_CHARACTERS.set('\u1109');
202        XML_NAME_ALLOWED_CHARACTERS.set('\u110B', '\u110C' + 1);
203        XML_NAME_ALLOWED_CHARACTERS.set('\u110E', '\u1112' + 1);
204        XML_NAME_ALLOWED_CHARACTERS.set('\u113C');
205        XML_NAME_ALLOWED_CHARACTERS.set('\u113E');
206        XML_NAME_ALLOWED_CHARACTERS.set('\u1140');
207        XML_NAME_ALLOWED_CHARACTERS.set('\u114C');
208        XML_NAME_ALLOWED_CHARACTERS.set('\u114E');
209        XML_NAME_ALLOWED_CHARACTERS.set('\u1150');
210        XML_NAME_ALLOWED_CHARACTERS.set('\u1154', '\u1155' + 1);
211        XML_NAME_ALLOWED_CHARACTERS.set('\u1159');
212        XML_NAME_ALLOWED_CHARACTERS.set('\u115F', '\u1161' + 1);
213        XML_NAME_ALLOWED_CHARACTERS.set('\u1163');
214        XML_NAME_ALLOWED_CHARACTERS.set('\u1165');
215        XML_NAME_ALLOWED_CHARACTERS.set('\u1167');
216        XML_NAME_ALLOWED_CHARACTERS.set('\u1169');
217        XML_NAME_ALLOWED_CHARACTERS.set('\u116D', '\u116E' + 1);
218        XML_NAME_ALLOWED_CHARACTERS.set('\u1172', '\u1173' + 1);
219        XML_NAME_ALLOWED_CHARACTERS.set('\u1175');
220        XML_NAME_ALLOWED_CHARACTERS.set('\u119E');
221        XML_NAME_ALLOWED_CHARACTERS.set('\u11A8');
222        XML_NAME_ALLOWED_CHARACTERS.set('\u11AB');
223        XML_NAME_ALLOWED_CHARACTERS.set('\u11AE', '\u11AF' + 1);
224        XML_NAME_ALLOWED_CHARACTERS.set('\u11B7', '\u11B8' + 1);
225        XML_NAME_ALLOWED_CHARACTERS.set('\u11BA');
226        XML_NAME_ALLOWED_CHARACTERS.set('\u11BC', '\u11C2' + 1);
227        XML_NAME_ALLOWED_CHARACTERS.set('\u11EB');
228        XML_NAME_ALLOWED_CHARACTERS.set('\u11F0');
229        XML_NAME_ALLOWED_CHARACTERS.set('\u11F9');
230        XML_NAME_ALLOWED_CHARACTERS.set('\u1E00', '\u1E9B' + 1);
231        XML_NAME_ALLOWED_CHARACTERS.set('\u1EA0', '\u1EF9' + 1);
232        XML_NAME_ALLOWED_CHARACTERS.set('\u1F00', '\u1F15' + 1);
233        XML_NAME_ALLOWED_CHARACTERS.set('\u1F18', '\u1F1D' + 1);
234        XML_NAME_ALLOWED_CHARACTERS.set('\u1F20', '\u1F45' + 1);
235        XML_NAME_ALLOWED_CHARACTERS.set('\u1F48', '\u1F4D' + 1);
236        XML_NAME_ALLOWED_CHARACTERS.set('\u1F50', '\u1F57' + 1);
237        XML_NAME_ALLOWED_CHARACTERS.set('\u1F59');
238        XML_NAME_ALLOWED_CHARACTERS.set('\u1F5B');
239        XML_NAME_ALLOWED_CHARACTERS.set('\u1F5D');
240        XML_NAME_ALLOWED_CHARACTERS.set('\u1F5F', '\u1F7D' + 1);
241        XML_NAME_ALLOWED_CHARACTERS.set('\u1F80', '\u1FB4' + 1);
242        XML_NAME_ALLOWED_CHARACTERS.set('\u1FB6', '\u1FBC' + 1);
243        XML_NAME_ALLOWED_CHARACTERS.set('\u1FBE');
244        XML_NAME_ALLOWED_CHARACTERS.set('\u1FC2', '\u1FC4' + 1);
245        XML_NAME_ALLOWED_CHARACTERS.set('\u1FC6', '\u1FCC' + 1);
246        XML_NAME_ALLOWED_CHARACTERS.set('\u1FD0', '\u1FD3' + 1);
247        XML_NAME_ALLOWED_CHARACTERS.set('\u1FD6', '\u1FDB' + 1);
248        XML_NAME_ALLOWED_CHARACTERS.set('\u1FE0', '\u1FEC' + 1);
249        XML_NAME_ALLOWED_CHARACTERS.set('\u1FF2', '\u1FF4' + 1);
250        XML_NAME_ALLOWED_CHARACTERS.set('\u1FF6', '\u1FFC' + 1);
251        XML_NAME_ALLOWED_CHARACTERS.set('\u2126');
252        XML_NAME_ALLOWED_CHARACTERS.set('\u212A', '\u212B' + 1);
253        XML_NAME_ALLOWED_CHARACTERS.set('\u212E');
254        XML_NAME_ALLOWED_CHARACTERS.set('\u2180', '\u2182' + 1);
255        XML_NAME_ALLOWED_CHARACTERS.set('\u3041', '\u3094' + 1);
256        XML_NAME_ALLOWED_CHARACTERS.set('\u30A1', '\u30FA' + 1);
257        XML_NAME_ALLOWED_CHARACTERS.set('\u3105', '\u312C' + 1);
258        XML_NAME_ALLOWED_CHARACTERS.set('\uAC00', '\uD7A3' + 1);
259
260        // XML Ideograph Character Set
261
262        XML_NAME_ALLOWED_CHARACTERS.set('\u4E00', '\u9FA5' + 1);
263        XML_NAME_ALLOWED_CHARACTERS.set('\u3007');
264        XML_NAME_ALLOWED_CHARACTERS.set('\u3021', '\u3029' + 1);
265
266        // XML Combining Character Set
267
268        XML_NAME_ALLOWED_CHARACTERS.set('\u0300', '\u0345' + 1);
269        XML_NAME_ALLOWED_CHARACTERS.set('\u0360', '\u0361' + 1);
270        XML_NAME_ALLOWED_CHARACTERS.set('\u0483', '\u0486' + 1);
271        XML_NAME_ALLOWED_CHARACTERS.set('\u0591', '\u05A1' + 1);
272        XML_NAME_ALLOWED_CHARACTERS.set('\u05A3', '\u05B9' + 1);
273        XML_NAME_ALLOWED_CHARACTERS.set('\u05BB', '\u05BD' + 1);
274        XML_NAME_ALLOWED_CHARACTERS.set('\u05BF');
275        XML_NAME_ALLOWED_CHARACTERS.set('\u05C1', '\u05C2' + 1);
276        XML_NAME_ALLOWED_CHARACTERS.set('\u05C4');
277        XML_NAME_ALLOWED_CHARACTERS.set('\u064B', '\u0652' + 1);
278        XML_NAME_ALLOWED_CHARACTERS.set('\u0670');
279        XML_NAME_ALLOWED_CHARACTERS.set('\u06D6', '\u06DC' + 1);
280        XML_NAME_ALLOWED_CHARACTERS.set('\u06DD', '\u06DF' + 1);
281        XML_NAME_ALLOWED_CHARACTERS.set('\u06E0', '\u06E4' + 1);
282        XML_NAME_ALLOWED_CHARACTERS.set('\u06E7', '\u06E8' + 1);
283        XML_NAME_ALLOWED_CHARACTERS.set('\u06EA', '\u06ED' + 1);
284        XML_NAME_ALLOWED_CHARACTERS.set('\u0901', '\u0903' + 1);
285        XML_NAME_ALLOWED_CHARACTERS.set('\u093C');
286        XML_NAME_ALLOWED_CHARACTERS.set('\u093E', '\u094C' + 1);
287        XML_NAME_ALLOWED_CHARACTERS.set('\u094D');
288        XML_NAME_ALLOWED_CHARACTERS.set('\u0951', '\u0954' + 1);
289        XML_NAME_ALLOWED_CHARACTERS.set('\u0962', '\u0963' + 1);
290        XML_NAME_ALLOWED_CHARACTERS.set('\u0981', '\u0983' + 1);
291        XML_NAME_ALLOWED_CHARACTERS.set('\u09BC');
292        XML_NAME_ALLOWED_CHARACTERS.set('\u09BE');
293        XML_NAME_ALLOWED_CHARACTERS.set('\u09BF');
294        XML_NAME_ALLOWED_CHARACTERS.set('\u09C0', '\u09C4' + 1);
295        XML_NAME_ALLOWED_CHARACTERS.set('\u09C7', '\u09C8' + 1);
296        XML_NAME_ALLOWED_CHARACTERS.set('\u09CB', '\u09CD' + 1);
297        XML_NAME_ALLOWED_CHARACTERS.set('\u09D7');
298        XML_NAME_ALLOWED_CHARACTERS.set('\u09E2', '\u09E3' + 1);
299        XML_NAME_ALLOWED_CHARACTERS.set('\u0A02');
300        XML_NAME_ALLOWED_CHARACTERS.set('\u0A3C');
301        XML_NAME_ALLOWED_CHARACTERS.set('\u0A3E');
302        XML_NAME_ALLOWED_CHARACTERS.set('\u0A3F');
303        XML_NAME_ALLOWED_CHARACTERS.set('\u0A40', '\u0A42' + 1);
304        XML_NAME_ALLOWED_CHARACTERS.set('\u0A47', '\u0A48' + 1);
305        XML_NAME_ALLOWED_CHARACTERS.set('\u0A4B', '\u0A4D' + 1);
306        XML_NAME_ALLOWED_CHARACTERS.set('\u0A70', '\u0A71' + 1);
307        XML_NAME_ALLOWED_CHARACTERS.set('\u0A81', '\u0A83' + 1);
308        XML_NAME_ALLOWED_CHARACTERS.set('\u0ABC');
309        XML_NAME_ALLOWED_CHARACTERS.set('\u0ABE', '\u0AC5' + 1);
310        XML_NAME_ALLOWED_CHARACTERS.set('\u0AC7', '\u0AC9' + 1);
311        XML_NAME_ALLOWED_CHARACTERS.set('\u0ACB', '\u0ACD' + 1);
312        XML_NAME_ALLOWED_CHARACTERS.set('\u0B01', '\u0B03' + 1);
313        XML_NAME_ALLOWED_CHARACTERS.set('\u0B3C');
314        XML_NAME_ALLOWED_CHARACTERS.set('\u0B3E', '\u0B43' + 1);
315        XML_NAME_ALLOWED_CHARACTERS.set('\u0B47', '\u0B48' + 1);
316        XML_NAME_ALLOWED_CHARACTERS.set('\u0B4B', '\u0B4D' + 1);
317        XML_NAME_ALLOWED_CHARACTERS.set('\u0B56', '\u0B57' + 1);
318        XML_NAME_ALLOWED_CHARACTERS.set('\u0B82', '\u0B83' + 1);
319        XML_NAME_ALLOWED_CHARACTERS.set('\u0BBE', '\u0BC2' + 1);
320        XML_NAME_ALLOWED_CHARACTERS.set('\u0BC6', '\u0BC8' + 1);
321        XML_NAME_ALLOWED_CHARACTERS.set('\u0BCA', '\u0BCD' + 1);
322        XML_NAME_ALLOWED_CHARACTERS.set('\u0BD7');
323        XML_NAME_ALLOWED_CHARACTERS.set('\u0C01', '\u0C03' + 1);
324        XML_NAME_ALLOWED_CHARACTERS.set('\u0C3E', '\u0C44' + 1);
325        XML_NAME_ALLOWED_CHARACTERS.set('\u0C46', '\u0C48' + 1);
326        XML_NAME_ALLOWED_CHARACTERS.set('\u0C4A', '\u0C4D' + 1);
327        XML_NAME_ALLOWED_CHARACTERS.set('\u0C55', '\u0C56' + 1);
328        XML_NAME_ALLOWED_CHARACTERS.set('\u0C82', '\u0C83' + 1);
329        XML_NAME_ALLOWED_CHARACTERS.set('\u0CBE', '\u0CC4' + 1);
330        XML_NAME_ALLOWED_CHARACTERS.set('\u0CC6', '\u0CC8' + 1);
331        XML_NAME_ALLOWED_CHARACTERS.set('\u0CCA', '\u0CCD' + 1);
332        XML_NAME_ALLOWED_CHARACTERS.set('\u0CD5', '\u0CD6' + 1);
333        XML_NAME_ALLOWED_CHARACTERS.set('\u0D02', '\u0D03' + 1);
334        XML_NAME_ALLOWED_CHARACTERS.set('\u0D3E', '\u0D43' + 1);
335        XML_NAME_ALLOWED_CHARACTERS.set('\u0D46', '\u0D48' + 1);
336        XML_NAME_ALLOWED_CHARACTERS.set('\u0D4A', '\u0D4D' + 1);
337        XML_NAME_ALLOWED_CHARACTERS.set('\u0D57');
338        XML_NAME_ALLOWED_CHARACTERS.set('\u0E31');
339        XML_NAME_ALLOWED_CHARACTERS.set('\u0E34', '\u0E3A' + 1);
340        XML_NAME_ALLOWED_CHARACTERS.set('\u0E47', '\u0E4E' + 1);
341        XML_NAME_ALLOWED_CHARACTERS.set('\u0EB1');
342        XML_NAME_ALLOWED_CHARACTERS.set('\u0EB4', '\u0EB9' + 1);
343        XML_NAME_ALLOWED_CHARACTERS.set('\u0EBB', '\u0EBC' + 1);
344        XML_NAME_ALLOWED_CHARACTERS.set('\u0EC8', '\u0ECD' + 1);
345        XML_NAME_ALLOWED_CHARACTERS.set('\u0F18', '\u0F19' + 1);
346        XML_NAME_ALLOWED_CHARACTERS.set('\u0F35');
347        XML_NAME_ALLOWED_CHARACTERS.set('\u0F37');
348        XML_NAME_ALLOWED_CHARACTERS.set('\u0F39');
349        XML_NAME_ALLOWED_CHARACTERS.set('\u0F3E');
350        XML_NAME_ALLOWED_CHARACTERS.set('\u0F3F');
351        XML_NAME_ALLOWED_CHARACTERS.set('\u0F71', '\u0F84' + 1);
352        XML_NAME_ALLOWED_CHARACTERS.set('\u0F86', '\u0F8B' + 1);
353        XML_NAME_ALLOWED_CHARACTERS.set('\u0F90', '\u0F95' + 1);
354        XML_NAME_ALLOWED_CHARACTERS.set('\u0F97');
355        XML_NAME_ALLOWED_CHARACTERS.set('\u0F99', '\u0FAD' + 1);
356        XML_NAME_ALLOWED_CHARACTERS.set('\u0FB1', '\u0FB7' + 1);
357        XML_NAME_ALLOWED_CHARACTERS.set('\u0FB9');
358        XML_NAME_ALLOWED_CHARACTERS.set('\u20D0', '\u20DC' + 1);
359        XML_NAME_ALLOWED_CHARACTERS.set('\u20E1');
360        XML_NAME_ALLOWED_CHARACTERS.set('\u302A', '\u302F' + 1);
361        XML_NAME_ALLOWED_CHARACTERS.set('\u3099');
362        XML_NAME_ALLOWED_CHARACTERS.set('\u309A');
363
364        // XML Digits
365        XML_NAME_ALLOWED_CHARACTERS.set('\u0030', '\u0039' + 1);
366        XML_NAME_ALLOWED_CHARACTERS.set('\u0660', '\u0669' + 1);
367        XML_NAME_ALLOWED_CHARACTERS.set('\u06F0', '\u06F9' + 1);
368        XML_NAME_ALLOWED_CHARACTERS.set('\u0966', '\u096F' + 1);
369        XML_NAME_ALLOWED_CHARACTERS.set('\u09E6', '\u09EF' + 1);
370        XML_NAME_ALLOWED_CHARACTERS.set('\u0A66', '\u0A6F' + 1);
371        XML_NAME_ALLOWED_CHARACTERS.set('\u0AE6', '\u0AEF' + 1);
372        XML_NAME_ALLOWED_CHARACTERS.set('\u0B66', '\u0B6F' + 1);
373        XML_NAME_ALLOWED_CHARACTERS.set('\u0BE7', '\u0BEF' + 1);
374        XML_NAME_ALLOWED_CHARACTERS.set('\u0C66', '\u0C6F' + 1);
375        XML_NAME_ALLOWED_CHARACTERS.set('\u0CE6', '\u0CEF' + 1);
376        XML_NAME_ALLOWED_CHARACTERS.set('\u0D66', '\u0D6F' + 1);
377        XML_NAME_ALLOWED_CHARACTERS.set('\u0E50', '\u0E59' + 1);
378        XML_NAME_ALLOWED_CHARACTERS.set('\u0ED0', '\u0ED9' + 1);
379        XML_NAME_ALLOWED_CHARACTERS.set('\u0F20', '\u0F29' + 1);
380
381        // XML Extenders
382        XML_NAME_ALLOWED_CHARACTERS.set('\u00B7');
383        XML_NAME_ALLOWED_CHARACTERS.set('\u02D0');
384        XML_NAME_ALLOWED_CHARACTERS.set('\u02D1');
385        XML_NAME_ALLOWED_CHARACTERS.set('\u0387');
386        XML_NAME_ALLOWED_CHARACTERS.set('\u0640');
387        XML_NAME_ALLOWED_CHARACTERS.set('\u0E46');
388        XML_NAME_ALLOWED_CHARACTERS.set('\u0EC6');
389        XML_NAME_ALLOWED_CHARACTERS.set('\u3005');
390        XML_NAME_ALLOWED_CHARACTERS.set('\u3031', '\u3035' + 1);
391        XML_NAME_ALLOWED_CHARACTERS.set('\u309D', '\u309E' + 1);
392        XML_NAME_ALLOWED_CHARACTERS.set('\u30FC', '\u30FE' + 1);
393
394        XML_NAME_START_ALLOWED_CHARACTERS.or(XML_NAME_ALLOWED_CHARACTERS);
395        // remove . and -
396        XML_NAME_START_ALLOWED_CHARACTERS.clear('-');
397        XML_NAME_START_ALLOWED_CHARACTERS.clear('.');
398
399        // remove all digits
400        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0030', '\u0039' + 1);
401        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0660', '\u0669' + 1);
402        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u06F0', '\u06F9' + 1);
403        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0966', '\u096F' + 1);
404        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u09E6', '\u09EF' + 1);
405        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0A66', '\u0A6F' + 1);
406        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0AE6', '\u0AEF' + 1);
407        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0B66', '\u0B6F' + 1);
408        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0BE7', '\u0BEF' + 1);
409        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0C66', '\u0C6F' + 1);
410        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0CE6', '\u0CEF' + 1);
411        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0D66', '\u0D6F' + 1);
412        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0E50', '\u0E59' + 1);
413        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0ED0', '\u0ED9' + 1);
414        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0F20', '\u0F29' + 1);
415
416        // remove extender characters
417        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u00B7');
418        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u02D0');
419        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u02D1');
420        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0387');
421        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0640');
422        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0E46');
423        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u0EC6');
424        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u3005');
425        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u3031', '\u3035' + 1);
426        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u309D', '\u309E' + 1);
427        XML_NAME_START_ALLOWED_CHARACTERS.clear('\u30FC', '\u30FE' + 1);
428
429    }
430
431    /**
432     * {@inheritDoc}
433     * 
434     * @see org.modeshape.common.text.TextDecoder#decode(java.lang.String)
435     */
436    @Override
437    public String decode( String encodedText ) {
438        if (encodedText == null) return null;
439        if (encodedText.length() < 7) {
440            // Not big enough to have an encoded sequence
441            return encodedText;
442        }
443        StringBuilder sb = new StringBuilder();
444        char[] digits = new char[4];
445        CharacterIterator iter = new StringCharacterIterator(encodedText);
446        for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
447            if (c == '_') {
448                // Read the next character, if there is one ...
449                char next = iter.next();
450                if (next == CharacterIterator.DONE) {
451                    sb.append(c);
452                    break;
453                }
454                // If the next character is not 'x', then these are just regular characters ...
455                if (next != 'x') {
456                    sb.append(c).append(next);
457                    continue;
458                }
459                // Read the next 4 characters (digits) and another '_' character ...
460                digits[0] = iter.next();
461                if (digits[0] == CharacterIterator.DONE) {
462                    sb.append(c).append(next);
463                    break;
464                }
465                digits[1] = iter.next();
466                if (digits[1] == CharacterIterator.DONE) {
467                    sb.append(c).append(next).append(digits, 0, 1);
468                    break;
469                }
470                digits[2] = iter.next();
471                if (digits[2] == CharacterIterator.DONE) {
472                    sb.append(c).append(next).append(digits, 0, 2);
473                    break;
474                }
475                digits[3] = iter.next();
476                if (digits[3] == CharacterIterator.DONE) {
477                    sb.append(c).append(next).append(digits, 0, 3);
478                    break;
479                }
480                char underscore = iter.next();
481                if (underscore != '_') { // includes DONE
482                    sb.append(c).append(next).append(digits, 0, 4);
483                    if (underscore == CharacterIterator.DONE) break;
484                    sb.append(underscore);
485                    continue;
486                }
487                // We've read all 4 digits, including the trailing '_'
488                // Now parse into the resulting character
489                try {
490                    sb.appendCodePoint(Integer.parseInt(new String(digits), 16));
491                } catch (NumberFormatException e) {
492                    // code was not hexadecimal, so just write out the characters as is ...
493                    sb.append(c).append(next).append(digits).append(underscore);
494                }
495            } else {
496                // Just append other characters ...
497                sb.append(c);
498            }
499        }
500        return sb.toString();
501    }
502
503    @Override
504    public String encode( String text ) {
505        if (text == null) return null;
506        if (text.length() == 0) return text;
507        StringBuilder sb = new StringBuilder();
508        String hex = null;
509        CharacterIterator iter = new StringCharacterIterator(text);
510        for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
511            if (c == '_') {
512                // Read the next character (if there is one) ...
513                char next = iter.next();
514                if (next == CharacterIterator.DONE) {
515                    sb.append(c);
516                    break;
517                }
518                // If the next character is not 'x', then these are just regular characters ...
519                if (next != 'x') {
520                    sb.append(c).append(next);
521                    continue;
522                }
523                // The next character is 'x', so write out the '_' character in encoded form ...
524                sb.append("_x005f_");
525                // And then write out the next character ...
526                sb.append(next);
527            } else if (iter.getIndex() == 0 && XML_NAME_START_ALLOWED_CHARACTERS.get(c)) {
528                // The fist only allows a subset from the total list of characters
529                sb.append(c);
530            } else if (iter.getIndex() > 0 && XML_NAME_ALLOWED_CHARACTERS.get(c)) {
531                // Legal characters for an XML Name ...
532                sb.append(c);
533            } else {
534                // All other characters must be escaped with '_xHHHH_' where 'HHHH' is the hex string for the code point
535                hex = Integer.toHexString(c);
536                // The hex string excludes the leading '0's, so check the character values so we know how many to prepend
537                if (c >= '\u0000' && c <= '\u000f') {
538                    sb.append("_x000").append(hex);
539                } else if (c >= '\u0010' && c <= '\u00ff') {
540                    sb.append("_x00").append(hex);
541                } else if (c >= '\u0100' && c <= '\u0fff') {
542                    sb.append("_x0").append(hex);
543                } else {
544                    sb.append("_x").append(hex);
545                }
546                sb.append('_');
547            }
548        }
549        return sb.toString();
550    }
551
552}