001/* 002 * ModeShape (http://www.modeshape.org) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.modeshape.common.text; 017 018import java.util.ArrayList; 019import java.util.Iterator; 020import java.util.List; 021import java.util.ListIterator; 022import java.util.Locale; 023import java.util.NoSuchElementException; 024import org.modeshape.common.CommonI18n; 025import org.modeshape.common.annotation.Immutable; 026import org.modeshape.common.annotation.NotThreadSafe; 027import org.modeshape.common.util.CheckArg; 028import org.modeshape.common.xml.XmlCharacters; 029 030/** 031 * A foundation for basic parsers that tokenizes input content and allows parsers to easily access and use those tokens. A 032 * {@link TokenStream} object literally represents the stream of {@link Token} objects that each represent a word, symbol, comment 033 * or other lexically-relevant piece of information. This simple framework makes it very easy to create a parser that walks 034 * through (or "consumes") the tokens in the order they appear and do something useful with that content (usually creating another 035 * representation of the content, such as some domain-specific Abstract Syntax Tree or object model). 036 * <p> 037 * </p> 038 * <h3>The parts</h3> 039 * <p> 040 * This simple framework consists of a couple of pieces that fit together to do the whole job of parsing input content. 
041 * </p> 042 * <p> 043 * The {@link Tokenizer} is responsible for consuming the character-level input content and constructing {@link Token} objects for 044 * the different words, symbols, or other meaningful elements contained in the content. Each Token object is a simple object that 045 * records the character(s) that make up the token's value, but it does this in a very lightweight and efficient way by pointing 046 * to the original character stream. Each token can be assigned a parser-specific integral <i>token type</i> that may make it 047 * easier to quickly figure out later in the process what kind of information each token represents. The general idea is to 048 * keep the Tokenizer logic very simple, and very often Tokenizers will merely look for the different kinds of characters (e.g., 049 * symbols, letters, digits, etc.) as well as things like quoted strings and comments. However, Tokenizers are never called by the 050 * parser, but instead are always given to the TokenStream that then calls the Tokenizer at the appropriate time. 051 * </p> 052 * <p> 053 * The {@link TokenStream} is supplied the input content, a Tokenizer implementation, and a few options. Its job is to prepare the 054 * content for processing, call the Tokenizer implementation to create the series of Token objects, and then provide an interface 055 * for walking through and consuming the tokens. This interface makes it possible to discover the value and type of the current 056 * token, and consume the current token and move to the next token. Plus, the interface has been designed to make the code that 057 * works with the tokens as readable as possible. 058 * </p> 059 * <p> 060 * The final component in this framework is the <b>Parser</b>. The parser is really any class that takes as input the content to 061 * be parsed and that outputs some meaningful information. 
The parser will do this by defining the Tokenizer, constructing a 062 * TokenStream object, and then using the TokenStream to walk through the sequence of Tokens and produce some meaningful 063 * representation of the content. Parsers can create instances of some object model, or they can create a domain-specific Abstract 064 * Syntax Tree representation. 065 * </p> 066 * <p> 067 * The benefit of breaking the responsibility along these lines is that the TokenStream implementation is able to encapsulate 068 * quite a bit of very tedious and very useful functionality, while still allowing a lot of flexibility as to what makes up the 069 * different tokens. It also makes the parser very easy to write and read (and thus maintain), without placing very many 070 * restrictions on how that logic is to be defined. Plus, because the TokenStream takes responsibility for tracking the positions 071 * of every token (including line and column numbers), it can automatically produce meaningful errors. 072 * </p> 073 * <h3>Consuming tokens</h3> 074 * <p> 075 * A parser works with the tokens on the TokenStream using a variety of methods: 076 * <ul> 077 * <li>The {@link #start()} method must be called before any of the other methods. It performs initialization and tokenizing, and 078 * prepares the internal state by finding the first token and setting an internal <i>current token</i> reference.</li> 079 * <li>The {@link #hasNext()} method can be called repeatedly to determine if there is another token after the <i>current 080 * token</i>. 
This is often useful when an unknown number of tokens is to be processed, and behaves very similarly to the 081 * {@link Iterator#hasNext()} method.</li> 082 * <li>The {@link #consume()} method returns the {@link Token#value() value} of the <i>current token</i> and moves the <i>current 083 * token</i> pointer to the next available token.</li> 084 * <li>The {@link #consume(String)} and {@link #consume(char)} methods look at the <i>current token</i> and ensure the token's 085 * {@link Token#value() value} matches the value supplied as a method parameter, or they throw a {@link ParsingException} if the 086 * values don't match. The {@link #consume(int)} method works similarly, except that it attempts to match the token's 087 * {@link Token#type() type}. And, the {@link #consume(String, String...)} is a convenience method that is equivalent to calling 088 * {@link #consume(String)} for each of the arguments.</li> 089 * <li>The {@link #canConsume(String)} and {@link #canConsume(char)} methods look at the <i>current token</i> and check whether 090 * the token's {@link Token#value() value} matches the value supplied as a method parameter. If there is a match, the method 091 * advances the <i>current token</i> reference and returns true. Otherwise, the <i>current token</i> does not match and the method 092 * returns false without advancing the <i>current token</i> reference or throwing a ParsingException. Similarly, the 093 * {@link #canConsume(int)} method checks the token's {@link Token#type() type} rather than the value, consuming the token and 094 * returning true if there is a match, or just returning false if there is no match. 
The {@link #canConsume(String, String...)} 095 * method determines whether all of the supplied values can be consumed in the given order.</li> 096 * <li>The {@link #matches(String)} and {@link #matches(char)} methods look at the <i>current token</i> and check whether the 097 * token's {@link Token#value() value} matches the value supplied as a method parameter. The method then returns whether there was 098 * a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the {@link #matches(int)} method checks the 099 * token's {@link Token#type() type} rather than the value. The {@link #matches(String, String...)} method is a convenience method 100 * that is equivalent to calling {@link #matches(String)} for each of the arguments, and the {@link #matches(int, int...)} method 101 * is a convenience method that is equivalent to calling {@link #matches(int)} for each of the arguments.</li> 102 * 103 * <li>The {@link #matchesAnyOf(String, String...)} method looks at the <i>current token</i> and checks whether the token's 104 * {@link Token#value() value} matches at least one of the values supplied as method parameters. The method then returns whether 105 * there was a match, but does <i>not</i> advance the <i>current token</i> pointer. Similarly, the 106 * {@link #matchesAnyOf(int, int...)} method checks the token's {@link Token#type() type} rather than the value.</li> </ul> 107 * </p> 108 * <p> 109 * With these methods, it's very easy to create a parser that looks at the current token to decide what to do, then consumes 110 * that token, and repeats this process. 
111 * </p> 112 * <h3>Example parser</h3> 113 * <p> 114 * Here is an example of a very simple parser that parses very simple and limited SQL <code>SELECT</code> and <code>DELETE</code> 115 * statements, such as <code>SELECT * FROM Customers</code> or 116 * <code>SELECT Name, StreetAddress AS Address, City, Zip FROM Customers</code> or 117 * <code>DELETE FROM Customers WHERE Zip=12345</code>: 118 * 119 * <pre> 120 * public class SampleSqlSelectParser { 121 * public List<Statement> parse( String ddl ) { 122 * TokenStream tokens = new TokenStream(ddl, new SqlTokenizer(), false); 123 * List<Statement> statements = new LinkedList<Statement>(); 124 * tokens.start(); 125 * while (tokens.hasNext()) { 126 * if (tokens.matches("SELECT")) { 127 * statements.add(parseSelect(tokens)); 128 * } else { 129 * statements.add(parseDelete(tokens)); 130 * } 131 * } 132 * return statements; 133 * } 134 * 135 * protected Select parseSelect( TokenStream tokens ) throws ParsingException { 136 * tokens.consume("SELECT"); 137 * List<Column> columns = parseColumns(tokens); 138 * tokens.consume("FROM"); 139 * String tableName = tokens.consume(); 140 * return new Select(tableName, columns); 141 * } 142 * 143 * protected List<Column> parseColumns( TokenStream tokens ) throws ParsingException { 144 * List<Column> columns = new LinkedList<Column>(); 145 * if (tokens.matches('*')) { 146 * tokens.consume(); // leave the columns empty to signal wildcard 147 * } else { 148 * // Read names for as long as each one is followed by a ',' 149 * do { 150 * String columnName = tokens.consume(); 151 * if (tokens.canConsume("AS")) { 152 * String columnAlias = tokens.consume(); 153 * columns.add(new Column(columnName, columnAlias)); 154 * } else { 155 * columns.add(new Column(columnName, null)); 156 * } 157 * } while (tokens.canConsume(',')); 158 * } 159 * return columns; 160 * } 161 * 162 * protected Delete parseDelete( TokenStream tokens ) throws ParsingException { 163 * tokens.consume("DELETE", "FROM"); 164 * String tableName = 
tokens.consume(); 165 * tokens.consume("WHERE"); 166 * String lhs = tokens.consume(); 167 * tokens.consume('='); 168 * String rhs = tokens.consume(); 169 * return new Delete(tableName, new Criteria(lhs, rhs)); 170 * } 171 * } 172 * public abstract class Statement { ... } 173 * public class Select extends Statement { ... } 174 * public class Delete extends Statement { ... } 175 * public class Column { ... } 176 * </pre> 177 * 178 * This example shows an idiomatic way of writing a parser that is stateless and thread-safe. The <code>parse(...)</code> method 179 * takes the input as a parameter, and returns the domain-specific representation that resulted from the parsing. All other 180 * methods are utility methods that simply encapsulate common logic or make the code more readable. 181 * </p> 182 * <p> 183 * In the example, the <code>parse(...)</code> method first creates a TokenStream object (using a Tokenizer implementation that is not 184 * shown), and then loops as long as there are more tokens to read. As it loops, if the next token is "SELECT", the parser calls 185 * the <code>parseSelect(...)</code> method which immediately consumes a "SELECT" token, the names of the columns separated by 186 * commas (or a '*' if all columns are to be selected), a "FROM" token, and the name of the table being queried. The 187 * <code>parseSelect(...)</code> method returns a <code>Select</code> object, which is then added to the list of statements in the 188 * <code>parse(...)</code> method. The parser handles the "DELETE" statements in a similar manner. 189 * </p> 190 * <h3>Case sensitivity</h3> 191 * <p> 192 * Very often grammars do not require the case of keywords to match. This can make parsing a challenge, because all combinations 193 * of case need to be handled. The TokenStream framework provides a very simple solution that requires no more effort than providing 194 * a boolean parameter to the constructor. 
195 * </p> 196 * <p> 197 * When a <code>false</code> value is provided for the <code>caseSensitive</code> parameter, the TokenStream performs all 198 * matching operations as if each token's value were in uppercase only. This means that the arguments supplied to the 199 * <code>matches(...)</code>, <code>canConsume(...)</code>, and <code>consume(...)</code> methods should be upper-cased. Note that 200 * the <i>actual value</i> of each token retains the <i>actual</i> case as it appears in the input. 201 * </p> 202 * <p> 203 * Of course, when the TokenStream is created with a <code>true</code> value for the <code>caseSensitive</code> parameter, the 204 * matching is performed using the <i>actual</i> value as it appears in the input content. 205 * </p> 206 * <h3>Whitespace</h3> 207 * <p> 208 * Many grammars are independent of line breaks or whitespace, allowing a lot of flexibility when writing the content. The 209 * TokenStream framework makes it very easy to ignore line breaks and whitespace. To do so, the Tokenizer implementation must 210 * simply not include the line break character sequences and whitespace in the token ranges. Since none of the tokens contain 211 * whitespace, the parser never has to deal with them. 212 * </p> 213 * <p> 214 * Of course, many parsers will require that some whitespace be included. For example, whitespace within a quoted string may be 215 * needed by the parser. In this case, the Tokenizer should simply include the whitespace characters in the tokens. 216 * </p> 217 * <h3>Writing a Tokenizer</h3> 218 * <p> 219 * Each parser will likely have its own {@link Tokenizer} implementation that contains the parser-specific logic about how to 220 * break the content into token objects. 
Generally, the easiest way to do this is to simply iterate through the character sequence 221 * passed into the {@link Tokenizer#tokenize(TokenStream.CharacterStream, TokenStream.Tokens) tokenize(...)} method, and use a switch statement to decide 222 * what to do. 223 * </p> 224 * <p> 225 * Here is the code for a very basic Tokenizer implementation that ignores whitespace, line breaks and Java-style (multi-line and 226 * end-of-line) comments, while constructing single tokens for each quoted string. 227 * 228 * <pre> 229 * public class BasicTokenizer implements Tokenizer { 230 * public void tokenize( CharacterStream input, 231 * Tokens tokens ) throws ParsingException { 232 * while (input.hasNext()) { 233 * char c = input.next(); 234 * switch (c) { 235 * case ' ': 236 * case '\t': 237 * case '\n': 238 * case '\r': 239 * // Just skip these whitespace characters ... 240 * break; 241 * case '-': 242 * case '(': 243 * case ')': 244 * case '{': 245 * case '}': 246 * case '*': 247 * case ',': 248 * case ';': 249 * case '+': 250 * case '%': 251 * case '?': 252 * case '$': 253 * case '[': 254 * case ']': 255 * case '!': 256 * case '<': 257 * case '>': 258 * case '|': 259 * case '=': 260 * case ':': 261 * tokens.addToken(input.index(), input.index() + 1, SYMBOL); 262 * break; 263 * case '.': 264 * tokens.addToken(input.index(), input.index() + 1, DECIMAL); 265 * break; 266 * case '\"': 268 * int startIndex = input.index(); 269 * Position startingPosition = input.position(); 270 * boolean foundClosingQuote = false; 271 * while (input.hasNext()) { 272 * c = input.next(); 273 * if (c == '\\' && input.isNext('"')) { 274 * c = input.next(); // consume the " character since it is escaped 275 * } else if (c == '"') { 276 * foundClosingQuote = true; 277 * break; 278 * } 279 * } 280 * if (!foundClosingQuote) { 281 * throw new ParsingException(startingPosition, "No matching closing double quote found"); 282 * } 283 * int endIndex = input.index() + 1; // beyond last 
character read 284 * tokens.addToken(startIndex, endIndex, DOUBLE_QUOTED_STRING); 285 * break; 286 * case '\'': 287 * startIndex = input.index(); 288 * startingPosition = input.position(); 289 * foundClosingQuote = false; 290 * while (input.hasNext()) { 291 * c = input.next(); 292 * if (c == '\\' && input.isNext('\'')) { 293 * c = input.next(); // consume the ' character since it is escaped 294 * } else if (c == '\'') { 295 * foundClosingQuote = true; 296 * break; 297 * } 298 * } 299 * if (!foundClosingQuote) { 300 * throw new ParsingException(startingPosition, "No matching closing single quote found"); 301 * } 302 * endIndex = input.index() + 1; // beyond last character read 303 * tokens.addToken(startIndex, endIndex, SINGLE_QUOTED_STRING); 304 * break; 305 * case '/': 306 * startIndex = input.index(); 307 * if (input.isNext('/')) { 308 * // End-of-line comment ... 309 * boolean foundLineTerminator = false; 310 * while (input.hasNext()) { 311 * c = input.next(); 312 * if (c == '\n' || c == '\r') { 313 * foundLineTerminator = true; 314 * break; 315 * } 316 * } 317 * endIndex = input.index(); // the token won't include the '\n' or '\r' character(s) 318 * if (!foundLineTerminator) ++endIndex; // must point beyond last char 319 * if (c == '\r' && input.isNext('\n')) input.next(); 320 * if (useComments) { 321 * tokens.addToken(startIndex, endIndex, COMMENT); 322 * } 323 * } else if (input.isNext('*')) { 324 * // Multi-line comment ... 325 * while (input.hasNext() && !input.isNext('*', '/')) { 326 * c = input.next(); 327 * } 328 * if (input.hasNext()) input.next(); // consume the '*' 329 * if (input.hasNext()) input.next(); // consume the '/' 330 * if (useComments) { 331 * endIndex = input.index() + 1; // the token will include the '/' and '*' characters 332 * tokens.addToken(startIndex, endIndex, COMMENT); 333 * } 334 * } else { 335 * // just a regular slash ... 
336 * tokens.addToken(startIndex, startIndex + 1, SYMBOL); 337 * } 338 * break; 339 * default: 340 * startIndex = input.index(); 341 * // Read until another whitespace/symbol/decimal/slash is found 342 * while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) { 343 * c = input.next(); 344 * } 345 * endIndex = input.index() + 1; // beyond last character that was included 346 * tokens.addToken(startIndex, endIndex, WORD); 347 * } 348 * } 349 * } 350 * } 351 * </pre> 352 * Tokenizers with exactly this behavior can actually be created using the {@link #basicTokenizer(boolean)} method. So while this very 353 * basic implementation is not meant to be used in all situations, it may be useful in some situations. 354 * </p> 355 */ 356@NotThreadSafe 357public class TokenStream { 358 359 /** 360 * A constant that can be used with the {@link #matches(String)}, {@link #matches(String, String...)}, 361 * {@link #consume(String)}, {@link #consume(String, String...)}, {@link #canConsume(String)} and 362 * {@link #canConsume(String, String...)} methods to signal that any value is allowed to be matched. 363 * <p> 364 * Note that this exact instance must be used; an equivalent string will not work. 365 * </p> 366 */ 367 public static final String ANY_VALUE = "any value"; 368 /** 369 * A constant that can be used with the {@link #matches(int)}, {@link #matches(int, int...)}, {@link #consume(int)}, and 370 * {@link #canConsume(int)} methods to signal that any token type is allowed to be matched. 371 */ 372 public static final int ANY_TYPE = Integer.MIN_VALUE; 373 374 protected final String inputString; 375 private final char[] inputContent; 376 private final boolean caseSensitive; 377 private final Tokenizer tokenizer; 378 private List<Token> tokens; 379 /** 380 * This class navigates the Token objects using this iterator. 
However, because it very often needs to access the 381 * "current token" in the "consume(...)" and "canConsume(...)" and "matches(...)" methods, the class caches a "current token" 382 * and makes this iterator point to the 2nd token. 383 * 384 * <pre> 385 * T1 T2 T3 T4 T5 386 * ˆ ˆ ˆ 387 * | | | 388 * | | +- The position of the tokenIterator, where tokenIterator.hasNext() will return T3 389 * | +---- The token referenced by currentToken 390 * +-------- The logical position of the TokenStream object, where the "consume()" would return T2 391 * </pre> 392 */ 393 private ListIterator<Token> tokenIterator; 394 private Token currentToken; 395 private boolean completed; 396 397 public TokenStream( String content, 398 Tokenizer tokenizer, 399 boolean caseSensitive ) { 400 CheckArg.isNotNull(content, "content"); 401 CheckArg.isNotNull(tokenizer, "tokenizer"); 402 this.inputString = content; 403 this.inputContent = content.toCharArray(); 404 this.caseSensitive = caseSensitive; 405 this.tokenizer = tokenizer; 406 } 407 408 /** 409 * Begin the token stream, including (if required) the tokenization of the input content. 410 * 411 * @return this object for easy method chaining; never null 412 * @throws ParsingException if an error occurs during tokenization of the content 413 */ 414 public TokenStream start() throws ParsingException { 415 // Create the tokens ... 416 if (tokens == null) { 417 TokenFactory tokenFactory = caseSensitive ? new CaseSensitiveTokenFactory() : new CaseInsensitiveTokenFactory(); 418 CharacterStream characterStream = new CharacterArrayStream(inputContent); 419 tokenizer.tokenize(characterStream, tokenFactory); 420 this.tokens = initializeTokens(tokenFactory.getTokens()); 421 } 422 423 // Create the iterator ... 424 tokenIterator = this.tokens.listIterator(); 425 moveToNextToken(); 426 return this; 427 } 428 429 /** 430 * Method to allow subclasses to preprocess the set of tokens and return the correct tokens to use. 
The default behavior is to 431 * simply return the supplied tokens. 432 * 433 * @param tokens 434 * @return list of tokens. 435 */ 436 protected List<Token> initializeTokens( List<Token> tokens ) { 437 return tokens; 438 } 439 440 /** 441 * Method to allow tokens to be re-used from the start without re-tokenizing content. 442 */ 443 public void rewind() { 444 // recreate the iterator ... 445 tokenIterator = this.tokens.listIterator(); 446 completed = false; 447 currentToken = null; 448 moveToNextToken(); 449 } 450 451 /** 452 * Get the position of the previous token. 453 * 454 * @return the previous token's position; never null 455 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 456 * @throws NoSuchElementException if there is no previous token 457 */ 458 public Position previousPosition() { 459 return previousToken().position(); 460 } 461 462 /** 463 * Get the position of the next (or current) token. 464 * 465 * @return the current token's position; never null 466 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 467 * @throws NoSuchElementException if there is no previous token 468 */ 469 public Position nextPosition() { 470 return currentToken().position(); 471 } 472 473 /** 474 * Convert the value of this token to an integer, return it, and move to the next token. 475 * 476 * @return the current token's value, converted to an integer 477 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer 478 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 479 */ 480 public int consumeInteger() throws ParsingException, IllegalStateException { 481 if (completed) throwNoMoreContent(); 482 // Get the value from the current token ... 
483 String value = currentToken().value(); 484 try { 485 int result = Integer.parseInt(value); 486 moveToNextToken(); 487 return result; 488 } catch (NumberFormatException e) { 489 Position position = currentToken().position(); 490 String msg = CommonI18n.expectingValidIntegerAtLineAndColumn.text(value, position.getLine(), position.getColumn()); 491 throw new ParsingException(position, msg); 492 } 493 } 494 495 /** 496 * Convert the value of this token to a long, return it, and move to the next token. 497 * 498 * @return the current token's value, converted to an integer 499 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to a long 500 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 501 */ 502 public long consumeLong() throws ParsingException, IllegalStateException { 503 if (completed) throwNoMoreContent(); 504 // Get the value from the current token ... 505 String value = currentToken().value(); 506 try { 507 long result = Long.parseLong(value); 508 moveToNextToken(); 509 return result; 510 } catch (NumberFormatException e) { 511 Position position = currentToken().position(); 512 String msg = CommonI18n.expectingValidLongAtLineAndColumn.text(value, position.getLine(), position.getColumn()); 513 throw new ParsingException(position, msg); 514 } 515 } 516 517 /** 518 * Convert the value of this token to an integer, return it, and move to the next token. 519 * 520 * @return the current token's value, converted to an integer 521 * @throws ParsingException if there is no such token to consume, or if the token cannot be converted to an integer 522 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 523 */ 524 public boolean consumeBoolean() throws ParsingException, IllegalStateException { 525 if (completed) throwNoMoreContent(); 526 // Get the value from the current token ... 
527 String value = currentToken().value(); 528 try { 529 boolean result = Boolean.parseBoolean(value); 530 moveToNextToken(); 531 return result; 532 } catch (NumberFormatException e) { 533 Position position = currentToken().position(); 534 String msg = CommonI18n.expectingValidBooleanAtLineAndColumn.text(value, position.getLine(), position.getColumn()); 535 throw new ParsingException(position, msg); 536 } 537 } 538 539 /** 540 * Return the value of this token and move to the next token. 541 * 542 * @return the value of the current token 543 * @throws ParsingException if there is no such token to consume 544 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 545 */ 546 public String consume() throws ParsingException, IllegalStateException { 547 if (completed) throwNoMoreContent(); 548 // Get the value from the current token ... 549 String result = currentToken().value(); 550 moveToNextToken(); 551 return result; 552 } 553 554 protected void throwNoMoreContent() throws ParsingException { 555 String msg = CommonI18n.noMoreContent.text(); 556 Position pos = tokens.isEmpty() ? new Position(-1, 1, 0) : tokens.get(tokens.size() - 1).position(); 557 throw new ParsingException(pos, msg); 558 } 559 560 /** 561 * Attempt to consume this current token as long as it matches the expected value, or throw an exception if the token does not 562 * match. 563 * <p> 564 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
565 * </p> 566 * 567 * @param expected the expected value of the current token 568 * @throws ParsingException if the current token doesn't match the supplied value 569 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 570 */ 571 public void consume( String expected ) throws ParsingException, IllegalStateException { 572 if (completed) { 573 String msg = CommonI18n.noMoreContentButWasExpectingToken.text(expected); 574 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg); 575 } 576 // Get the value from the current token ... 577 if (expected != ANY_VALUE && !currentToken().matches(expected)) { 578 String found = currentToken().value(); 579 Position pos = currentToken().position(); 580 String fragment = generateFragment(); 581 String msg = CommonI18n.unexpectedToken.text(expected, found, pos.getLine(), pos.getColumn(), fragment); 582 throw new ParsingException(pos, msg); 583 } 584 moveToNextToken(); 585 } 586 587 /** 588 * Attempt to consume this current token as long as it matches the expected character, or throw an exception if the token does 589 * not match. 590 * 591 * @param expected the expected character of the current token 592 * @throws ParsingException if the current token doesn't match the supplied value 593 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 594 */ 595 public void consume( char expected ) throws ParsingException, IllegalStateException { 596 if (completed) { 597 String msg = CommonI18n.noMoreContentButWasExpectingCharacter.text(expected); 598 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg); 599 } 600 // Get the value from the current token ... 
601 if (!currentToken().matches(expected)) { 602 String found = currentToken().value(); 603 Position pos = currentToken().position(); 604 String fragment = generateFragment(); 605 String msg = CommonI18n.unexpectedCharacter.text(expected, found, pos.getLine(), pos.getColumn(), fragment); 606 throw new ParsingException(pos, msg); 607 } 608 moveToNextToken(); 609 } 610 611 /** 612 * Attempt to consume this current token as long as it matches the expected character, or throw an exception if the token does 613 * not match. 614 * <p> 615 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard. 616 * </p> 617 * 618 * @param expectedType the expected token type of the current token 619 * @throws ParsingException if the current token doesn't match the supplied value 620 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 621 */ 622 public void consume( int expectedType ) throws ParsingException, IllegalStateException { 623 if (completed) { 624 String msg = CommonI18n.noMoreContentButWasExpectingTokenType.text(expectedType); 625 throw new ParsingException(tokens.get(tokens.size() - 1).position(), msg); 626 } 627 // Get the value from the current token ... 628 if (expectedType != ANY_TYPE && currentToken().type() != expectedType) { 629 String found = currentToken().value(); 630 Position pos = currentToken().position(); 631 String fragment = generateFragment(); 632 String msg = CommonI18n.unexpectedTokenType.text(expectedType, found, pos.getLine(), pos.getColumn(), fragment); 633 throw new ParsingException(pos, msg); 634 } 635 moveToNextToken(); 636 } 637 638 /** 639 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception 640 * if the token does not match. 641 * <p> 642 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
643 * </p> 644 * 645 * @param expected the expected value of the current token 646 * @param expectedForNextTokens the expected values fo the following tokens 647 * @throws ParsingException if the current token doesn't match the supplied value 648 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 649 */ 650 public void consume( String expected, 651 String... expectedForNextTokens ) throws ParsingException, IllegalStateException { 652 consume(expected); 653 for (String nextExpected : expectedForNextTokens) { 654 consume(nextExpected); 655 } 656 } 657 658 /** 659 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception 660 * if the token does not match. 661 * <p> 662 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 663 * </p> 664 * 665 * @param nextTokens the expected values for the next tokens 666 * @throws ParsingException if the current token doesn't match the supplied value 667 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 668 */ 669 public void consume( String[] nextTokens ) throws ParsingException, IllegalStateException { 670 for (String nextExpected : nextTokens) { 671 consume(nextExpected); 672 } 673 } 674 675 /** 676 * Attempt to consume this current token as the next tokens as long as they match the expected values, or throw an exception 677 * if the token does not match. 678 * <p> 679 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
680 * </p> 681 * 682 * @param nextTokens the expected values for the next tokens 683 * @throws ParsingException if the current token doesn't match the supplied value 684 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 685 */ 686 public void consume( Iterable<String> nextTokens ) throws ParsingException, IllegalStateException { 687 for (String nextExpected : nextTokens) { 688 consume(nextExpected); 689 } 690 } 691 692 /** 693 * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to 694 * consume the token. 695 * <p> 696 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected value as a wildcard. 697 * </p> 698 * 699 * @param expected the expected value of the current token token 700 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 701 * not consumed 702 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 703 */ 704 public boolean canConsume( String expected ) throws IllegalStateException { 705 if (!matches(expected)) return false; 706 moveToNextToken(); 707 return true; 708 } 709 710 /** 711 * Attempt to consume this current token if it matches the expected value, and return whether this method was indeed able to 712 * consume the token. 
713 * 714 * @param expected the expected value of the current token token 715 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 716 * not consumed 717 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 718 */ 719 public boolean canConsume( char expected ) throws IllegalStateException { 720 if (!matches(expected)) return false; 721 moveToNextToken(); 722 return true; 723 } 724 725 /** 726 * Attempt to consume this current token if it matches the expected token type, and return whether this method was indeed able 727 * to consume the token. 728 * <p> 729 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected type as a wildcard. 730 * </p> 731 * 732 * @param expectedType the expected token type of the current token 733 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 734 * not consumed 735 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 736 */ 737 public boolean canConsume( int expectedType ) throws IllegalStateException { 738 if (!matches(expectedType)) return false; 739 moveToNextToken(); 740 return true; 741 } 742 743 /** 744 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether 745 * this method was indeed able to consume all of the supplied tokens. 746 * <p> 747 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method 748 * ensures that <i>all</i> of the supplied values can be consumed. 
749 * </p> 750 * <p> 751 * This method <i>is</i> equivalent to calling the following: 752 * 753 * <pre> 754 * 755 * if (tokens.matches(currentExpected, expectedForNextTokens)) { 756 * tokens.consume(currentExpected, expectedForNextTokens); 757 * } 758 * 759 * </pre> 760 * 761 * </p> 762 * <p> 763 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 764 * </p> 765 * 766 * @param currentExpected the expected value of the current token 767 * @param expectedForNextTokens the expected values fo the following tokens 768 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 769 * not consumed 770 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 771 */ 772 public boolean canConsume( String currentExpected, 773 String... expectedForNextTokens ) throws IllegalStateException { 774 if (completed) return false; 775 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 776 if (!iter.hasNext()) return false; 777 Token token = iter.next(); 778 if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false; 779 for (String nextExpected : expectedForNextTokens) { 780 if (!iter.hasNext()) return false; 781 token = iter.next(); 782 if (nextExpected == ANY_VALUE) continue; 783 if (!token.matches(nextExpected)) return false; 784 } 785 this.tokenIterator = iter; 786 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null; 787 this.completed = this.currentToken == null; 788 return true; 789 } 790 791 /** 792 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether 793 * this method was indeed able to consume all of the supplied tokens. 
794 * <p> 795 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method 796 * ensures that <i>all</i> of the supplied values can be consumed. 797 * </p> 798 * <p> 799 * This method <i>is</i> equivalent to calling the following: 800 * 801 * <pre> 802 * 803 * if (tokens.matches(currentExpected, expectedForNextTokens)) { 804 * tokens.consume(currentExpected, expectedForNextTokens); 805 * } 806 * 807 * </pre> 808 * 809 * </p> 810 * <p> 811 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 812 * </p> 813 * 814 * @param nextTokens the expected values of the next tokens 815 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 816 * not consumed 817 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 818 */ 819 public boolean canConsume( String[] nextTokens ) throws IllegalStateException { 820 if (completed) return false; 821 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 822 Token token = null; 823 for (String nextExpected : nextTokens) { 824 if (!iter.hasNext()) return false; 825 token = iter.next(); 826 if (nextExpected == ANY_VALUE) continue; 827 if (!token.matches(nextExpected)) return false; 828 } 829 this.tokenIterator = iter; 830 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null; 831 this.completed = this.currentToken == null; 832 return true; 833 } 834 835 /** 836 * Attempt to consume this current token and the next tokens if and only if they match the expected values, and return whether 837 * this method was indeed able to consume all of the supplied tokens. 838 * <p> 839 * This is <i>not</i> the same as calling {@link #canConsume(String)} for each of the supplied arguments, since this method 840 * ensures that <i>all</i> of the supplied values can be consumed. 
841 * </p> 842 * <p> 843 * This method <i>is</i> equivalent to calling the following: 844 * 845 * <pre> 846 * 847 * if (tokens.matches(currentExpected, expectedForNextTokens)) { 848 * tokens.consume(currentExpected, expectedForNextTokens); 849 * } 850 * 851 * </pre> 852 * 853 * </p> 854 * <p> 855 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 856 * </p> 857 * 858 * @param nextTokens the expected values of the next tokens 859 * @return true if the current token did match and was consumed, or false if the current token did not match and therefore was 860 * not consumed 861 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 862 */ 863 public boolean canConsume( Iterable<String> nextTokens ) throws IllegalStateException { 864 if (completed) return false; 865 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 866 Token token = null; 867 for (String nextExpected : nextTokens) { 868 if (!iter.hasNext()) return false; 869 token = iter.next(); 870 if (nextExpected == ANY_VALUE) continue; 871 if (!token.matches(nextExpected)) return false; 872 } 873 this.tokenIterator = iter; 874 this.currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null; 875 this.completed = this.currentToken == null; 876 return true; 877 } 878 879 /** 880 * Attempt to consume the next token if it matches one of the supplied values. 881 * 882 * @param firstOption the first option for the value of the current token 883 * @param additionalOptions the additional options for the value of the current token 884 * @return true if the current token's value did match one of the suplied options, or false otherwise 885 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 886 */ 887 public boolean canConsumeAnyOf( String firstOption, 888 String... 
additionalOptions ) throws IllegalStateException { 889 if (completed) return false; 890 if (canConsume(firstOption)) return true; 891 for (String nextOption : additionalOptions) { 892 if (canConsume(nextOption)) return true; 893 } 894 return false; 895 } 896 897 /** 898 * Attempt to consume the next token if it matches one of the supplied values. 899 * 900 * @param options the options for the value of the current token 901 * @return true if the current token's value did match one of the suplied options, or false otherwise 902 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 903 */ 904 public boolean canConsumeAnyOf( String[] options ) throws IllegalStateException { 905 if (completed) return false; 906 for (String option : options) { 907 if (canConsume(option)) return true; 908 } 909 return false; 910 } 911 912 /** 913 * Attempt to consume the next token if it matches one of the supplied values. 914 * 915 * @param options the options for the value of the current token 916 * @return true if the current token's value did match one of the suplied options, or false otherwise 917 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 918 */ 919 public boolean canConsumeAnyOf( Iterable<String> options ) throws IllegalStateException { 920 if (completed) return false; 921 for (String option : options) { 922 if (canConsume(option)) return true; 923 } 924 return false; 925 } 926 927 /** 928 * Attempt to consume the next token if it matches one of the supplied types. 
929 * 930 * @param firstTypeOption the first option for the type of the current token 931 * @param additionalTypeOptions the additional options for the type of the current token 932 * @return true if the current token's type matched one of the supplied options, or false otherwise 933 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 934 */ 935 public boolean canConsumeAnyOf( int firstTypeOption, 936 int... additionalTypeOptions ) throws IllegalStateException { 937 if (completed) return false; 938 if (canConsume(firstTypeOption)) return true; 939 for (int nextTypeOption : additionalTypeOptions) { 940 if (canConsume(nextTypeOption)) return true; 941 } 942 return false; 943 } 944 945 /** 946 * Attempt to consume the next token if it matches one of the supplied types. 947 * 948 * @param typeOptions the options for the type of the current token 949 * @return true if the current token's type matched one of the supplied options, or false otherwise 950 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 951 */ 952 public boolean canConsumeAnyOf( int[] typeOptions ) throws IllegalStateException { 953 if (completed) return false; 954 for (int nextTypeOption : typeOptions) { 955 if (canConsume(nextTypeOption)) return true; 956 } 957 return false; 958 } 959 960 /** 961 * Determine if the current token matches the expected value. 962 * <p> 963 * The {@link #ANY_VALUE ANY_VALUE} constant can be used as a wildcard. 
964 * </p> 965 * 966 * @param expected the expected value of the current token token 967 * @return true if the current token did match, or false if the current token did not match 968 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 969 */ 970 public boolean matches( String expected ) throws IllegalStateException { 971 return !completed && (expected == ANY_VALUE || currentToken().matches(expected)); 972 } 973 974 /** 975 * Determine if the current token matches the expected value. 976 * 977 * @param expected the expected value of the current token token 978 * @return true if the current token did match, or false if the current token did not match 979 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 980 */ 981 public boolean matches( char expected ) throws IllegalStateException { 982 return !completed && currentToken().matches(expected); 983 } 984 985 /** 986 * Determine if the current token matches the expected token type. 987 * 988 * @param expectedType the expected token type of the current token 989 * @return true if the current token did match, or false if the current token did not match 990 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 991 */ 992 public boolean matches( int expectedType ) throws IllegalStateException { 993 return !completed && currentToken().matches(expectedType); 994 } 995 996 /** 997 * Determine if the next few tokens match the expected values. 998 * <p> 999 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
1000 * </p> 1001 * 1002 * @param currentExpected the expected value of the current token 1003 * @param expectedForNextTokens the expected values for the following tokens 1004 * @return true if the tokens did match, or false otherwise 1005 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1006 */ 1007 public boolean matches( String currentExpected, 1008 String... expectedForNextTokens ) throws IllegalStateException { 1009 if (completed) return false; 1010 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1011 if (!iter.hasNext()) return false; 1012 Token token = iter.next(); 1013 if (currentExpected != ANY_VALUE && !token.matches(currentExpected)) return false; 1014 for (String nextExpected : expectedForNextTokens) { 1015 if (!iter.hasNext()) return false; 1016 token = iter.next(); 1017 if (nextExpected == ANY_VALUE) continue; 1018 if (!token.matches(nextExpected)) return false; 1019 } 1020 return true; 1021 } 1022 1023 /** 1024 * Determine if the next few tokens match the expected values. 1025 * <p> 1026 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 
1027 * </p> 1028 * 1029 * @param nextTokens the expected value of the next tokens 1030 * @return true if the tokens did match, or false otherwise 1031 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1032 */ 1033 public boolean matches( String[] nextTokens ) throws IllegalStateException { 1034 if (completed) return false; 1035 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1036 Token token = null; 1037 for (String nextExpected : nextTokens) { 1038 if (!iter.hasNext()) return false; 1039 token = iter.next(); 1040 if (nextExpected == ANY_VALUE) continue; 1041 if (!token.matches(nextExpected)) return false; 1042 } 1043 return true; 1044 } 1045 1046 /** 1047 * Determine if the next few tokens match the expected values. 1048 * <p> 1049 * The {@link #ANY_VALUE ANY_VALUE} constant can be used in the expected values as a wildcard. 1050 * </p> 1051 * 1052 * @param nextTokens the expected value of the next tokens 1053 * @return true if the tokens did match, or false otherwise 1054 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1055 */ 1056 public boolean matches( Iterable<String> nextTokens ) throws IllegalStateException { 1057 if (completed) return false; 1058 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1059 Token token = null; 1060 for (String nextExpected : nextTokens) { 1061 if (!iter.hasNext()) return false; 1062 token = iter.next(); 1063 if (nextExpected == ANY_VALUE) continue; 1064 if (!token.matches(nextExpected)) return false; 1065 } 1066 return true; 1067 } 1068 1069 /** 1070 * Determine if the next few tokens have the supplied types. 1071 * <p> 1072 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard. 
1073 * </p> 1074 * 1075 * @param currentExpectedType the expected type of the current token 1076 * @param expectedTypeForNextTokens the expected type for the following tokens 1077 * @return true if the tokens did match, or false otherwise 1078 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1079 */ 1080 public boolean matches( int currentExpectedType, 1081 int... expectedTypeForNextTokens ) throws IllegalStateException { 1082 if (completed) return false; 1083 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1084 if (!iter.hasNext()) return false; 1085 Token token = iter.next(); 1086 if (currentExpectedType != ANY_TYPE && currentToken().type() != currentExpectedType) return false; 1087 for (int nextExpectedType : expectedTypeForNextTokens) { 1088 if (!iter.hasNext()) return false; 1089 token = iter.next(); 1090 if (nextExpectedType == ANY_TYPE) continue; 1091 if (token.type() != nextExpectedType) return false; 1092 } 1093 return true; 1094 } 1095 1096 /** 1097 * Determine if the next few tokens have the supplied types. 1098 * <p> 1099 * The {@link #ANY_TYPE ANY_TYPE} constant can be used in the expected values as a wildcard. 
1100 * </p> 1101 * 1102 * @param typesForNextTokens the expected type for each of the next tokens 1103 * @return true if the tokens did match, or false otherwise 1104 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1105 */ 1106 public boolean matches( int[] typesForNextTokens ) throws IllegalStateException { 1107 if (completed) return false; 1108 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1109 Token token = null; 1110 for (int nextExpectedType : typesForNextTokens) { 1111 if (!iter.hasNext()) return false; 1112 token = iter.next(); 1113 if (nextExpectedType == ANY_TYPE) continue; 1114 if (!token.matches(nextExpectedType)) return false; 1115 } 1116 return true; 1117 } 1118 1119 /** 1120 * Determine if the next token matches one of the supplied values. 1121 * 1122 * @param firstOption the first option for the value of the current token 1123 * @param additionalOptions the additional options for the value of the current token 1124 * @return true if the current token's value did match one of the suplied options, or false otherwise 1125 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1126 */ 1127 public boolean matchesAnyOf( String firstOption, 1128 String... additionalOptions ) throws IllegalStateException { 1129 if (completed) return false; 1130 Token current = currentToken(); 1131 if (current.matches(firstOption)) return true; 1132 for (String nextOption : additionalOptions) { 1133 if (current.matches(nextOption)) return true; 1134 } 1135 return false; 1136 } 1137 1138 /** 1139 * Determine if the next token matches one of the supplied values. 
1140 * 1141 * @param options the options for the value of the current token 1142 * @return true if the current token's value did match one of the suplied options, or false otherwise 1143 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1144 */ 1145 public boolean matchesAnyOf( String[] options ) throws IllegalStateException { 1146 if (completed) return false; 1147 Token current = currentToken(); 1148 for (String option : options) { 1149 if (current.matches(option)) return true; 1150 } 1151 return false; 1152 } 1153 1154 /** 1155 * Determine if the next token matches one of the supplied values. 1156 * 1157 * @param options the options for the value of the current token 1158 * @return true if the current token's value did match one of the suplied options, or false otherwise 1159 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1160 */ 1161 public boolean matchesAnyOf( Iterable<String> options ) throws IllegalStateException { 1162 if (completed) return false; 1163 Token current = currentToken(); 1164 for (String option : options) { 1165 if (current.matches(option)) return true; 1166 } 1167 return false; 1168 } 1169 1170 /** 1171 * Determine if the next token have one of the supplied types. 1172 * 1173 * @param firstTypeOption the first option for the type of the current token 1174 * @param additionalTypeOptions the additional options for the type of the current token 1175 * @return true if the current token's type matched one of the supplied options, or false otherwise 1176 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1177 */ 1178 public boolean matchesAnyOf( int firstTypeOption, 1179 int... 
additionalTypeOptions ) throws IllegalStateException { 1180 if (completed) return false; 1181 int currentType = currentToken().type(); 1182 if (currentType == firstTypeOption) return true; 1183 for (int nextTypeOption : additionalTypeOptions) { 1184 if (currentType == nextTypeOption) return true; 1185 } 1186 return false; 1187 } 1188 1189 /** 1190 * Determine if the next token have one of the supplied types. 1191 * 1192 * @param typeOptions the options for the type of the current token 1193 * @return true if the current token's type matched one of the supplied options, or false otherwise 1194 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1195 */ 1196 public boolean matchesAnyOf( int[] typeOptions ) throws IllegalStateException { 1197 if (completed) return false; 1198 int currentType = currentToken().type(); 1199 for (int nextTypeOption : typeOptions) { 1200 if (currentType == nextTypeOption) return true; 1201 } 1202 return false; 1203 } 1204 1205 /** 1206 * Determine if this stream has another token to be consumed. 
1207 * 1208 * @return true if there is another token ready for consumption, or false otherwise 1209 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1210 */ 1211 public boolean hasNext() { 1212 if (tokenIterator == null) { 1213 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeNext.text()); 1214 } 1215 return !completed; 1216 } 1217 1218 @Override 1219 public String toString() { 1220 ListIterator<Token> iter = tokens.listIterator(tokenIterator.previousIndex()); 1221 StringBuilder sb = new StringBuilder(); 1222 if (iter.hasNext()) { 1223 sb.append(iter.next()); 1224 int count = 1; 1225 while (iter.hasNext()) { 1226 if (count > 20) { 1227 sb.append(" ..."); 1228 break; 1229 } 1230 sb.append(" "); 1231 ++count; 1232 sb.append(iter.next()); 1233 } 1234 } 1235 return sb.toString(); 1236 } 1237 1238 private void moveToNextToken() { 1239 // And move the currentToken to the next token ... 1240 if (!tokenIterator.hasNext()) { 1241 completed = true; 1242 currentToken = null; 1243 } else { 1244 currentToken = tokenIterator.next(); 1245 } 1246 } 1247 1248 /** 1249 * Get the current token. 1250 * 1251 * @return the current token; never null 1252 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1253 * @throws NoSuchElementException if there are no more tokens 1254 */ 1255 final Token currentToken() throws IllegalStateException, NoSuchElementException { 1256 if (currentToken == null) { 1257 if (completed) { 1258 throw new NoSuchElementException(CommonI18n.noMoreContent.text()); 1259 } 1260 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text()); 1261 } 1262 assert currentToken != null; 1263 return currentToken; 1264 } 1265 1266 /** 1267 * Gets the content string starting at the first position (inclusive) and continuing up to the end position (exclusive). 
1268 * 1269 * @param starting the position marking the beginning of the desired content string. 1270 * @param end the position located directly after the returned content string; can be null, which means end of content 1271 * @return the content string; never null 1272 */ 1273 public String getContentBetween( Position starting, 1274 Position end ) { 1275 CheckArg.isNotNull(starting, "starting"); 1276 1277 int startIndex = starting.getIndexInContent(); 1278 int endIndex = inputString.length(); 1279 if (end != null) { 1280 endIndex = end.getIndexInContent(); 1281 } 1282 1283 if (startIndex >= endIndex) { 1284 throw new IllegalArgumentException(CommonI18n.endPositionMustBeGreaterThanStartingPosition.text(startIndex, endIndex)); 1285 } 1286 1287 return inputString.substring(startIndex, endIndex); 1288 } 1289 1290 /** 1291 * Get the previous token. This does not modify the state. 1292 * 1293 * @return the previous token; never null 1294 * @throws IllegalStateException if this method was called before the stream was {@link #start() started} 1295 * @throws NoSuchElementException if there is no previous token 1296 */ 1297 final Token previousToken() throws IllegalStateException, NoSuchElementException { 1298 if (currentToken == null) { 1299 if (completed) { 1300 if (tokens.isEmpty()) { 1301 throw new NoSuchElementException(CommonI18n.noMoreContent.text()); 1302 } 1303 return tokens.get(tokens.size() - 1); 1304 } 1305 throw new IllegalStateException(CommonI18n.startMethodMustBeCalledBeforeConsumingOrMatching.text()); 1306 } 1307 if (tokenIterator.previousIndex() == 0) { 1308 throw new NoSuchElementException(CommonI18n.noMoreContent.text()); 1309 } 1310 return tokens.get(tokenIterator.previousIndex() - 1); 1311 } 1312 1313 String generateFragment() { 1314 // Find the current position ... 
1315 assert currentToken != null; 1316 int startIndex = currentToken.startIndex(); 1317 return generateFragment(inputString, startIndex, 20, " ===>> "); 1318 } 1319 1320 /** 1321 * Utility method to generate a highlighted fragment of a particular point in the stream. 1322 * 1323 * @param content the content from which the fragment should be taken; may not be null 1324 * @param indexOfProblem the index of the problem point that should be highlighted; must be a valid index in the content 1325 * @param charactersToIncludeBeforeAndAfter the maximum number of characters before and after the problem point to include in 1326 * the fragment 1327 * @param highlightText the text that should be included in the fragment at the problem point to highlight the location, or an 1328 * empty string if there should be no highlighting 1329 * @return the highlighted fragment; never null 1330 */ 1331 static String generateFragment( String content, 1332 int indexOfProblem, 1333 int charactersToIncludeBeforeAndAfter, 1334 String highlightText ) { 1335 assert content != null; 1336 assert indexOfProblem < content.length(); 1337 // Find the substring that immediately precedes the current position ... 1338 int beforeStart = Math.max(0, indexOfProblem - charactersToIncludeBeforeAndAfter); 1339 String before = content.substring(beforeStart, indexOfProblem); 1340 1341 // Find the substring that immediately follows the current position ... 1342 int afterEnd = Math.min(indexOfProblem + charactersToIncludeBeforeAndAfter, content.length()); 1343 String after = content.substring(indexOfProblem, afterEnd); 1344 1345 return before + (highlightText != null ? highlightText : "") + after; 1346 } 1347 1348 /** 1349 * Interface for a Tokenizer component responsible for processing the characters in a {@link CharacterStream} and constructing 1350 * the appropriate {@link Token} objects. 
1351 */ 1352 public static interface Tokenizer { 1353 /** 1354 * Process the supplied characters and construct the appropriate {@link Token} objects. 1355 * 1356 * @param input the character input stream; never null 1357 * @param tokens the factory for {@link Token} objects, which records the order in which the tokens are created 1358 * @throws ParsingException if there is an error while processing the character stream (e.g., a quote is not closed, etc.) 1359 */ 1360 void tokenize( CharacterStream input, 1361 Tokens tokens ) throws ParsingException; 1362 } 1363 1364 /** 1365 * Interface used by a {@link Tokenizer} to iterate through the characters in the content input to the {@link TokenStream}. 1366 */ 1367 public static interface CharacterStream { 1368 1369 /** 1370 * Determine if there is another character available in this stream. 1371 * 1372 * @return true if there is another character (and {@link #next()} can be called), or false otherwise 1373 */ 1374 boolean hasNext(); 1375 1376 /** 1377 * Obtain the next character value, and advance the stream. 1378 * 1379 * @return the next character 1380 * @throws NoSuchElementException if there is no {@link #hasNext() next character} 1381 */ 1382 char next(); 1383 1384 /** 1385 * Get the index for the last character returned from {@link #next()}. 1386 * 1387 * @return the index of the last character returned 1388 */ 1389 int index(); 1390 1391 /** 1392 * Get the position for the last character returned from {@link #next()}. 1393 * 1394 * @param startIndex 1395 * @return the position of the last character returned; never null 1396 */ 1397 Position position( int startIndex ); 1398 1399 /** 1400 * Determine if the next character on the sream is a {@link Character#isWhitespace(char) whitespace character}. This 1401 * method does <i>not</i> advance the stream. 
1402 * 1403 * @return true if there is a {@link #next() next} character and it is a whitespace character, or false otherwise 1404 */ 1405 boolean isNextWhitespace(); 1406 1407 /** 1408 * Determine if the next character on the sream is a {@link Character#isLetterOrDigit(char) letter or digit}. This method 1409 * does <i>not</i> advance the stream. 1410 * 1411 * @return true if there is a {@link #next() next} character and it is a letter or digit, or false otherwise 1412 */ 1413 boolean isNextLetterOrDigit(); 1414 1415 /** 1416 * Determine if the next character on the sream is a {@link XmlCharacters#isValid(int) valid XML character}. This method 1417 * does <i>not</i> advance the stream. 1418 * 1419 * @return true if there is a {@link #next() next} character and it is a valid XML character, or false otherwise 1420 */ 1421 boolean isNextValidXmlCharacter(); 1422 1423 /** 1424 * Determine if the next character on the sream is a {@link XmlCharacters#isValidName(int) valid XML NCName character}. 1425 * This method does <i>not</i> advance the stream. 1426 * 1427 * @return true if there is a {@link #next() next} character and it is a valid XML Name character, or false otherwise 1428 */ 1429 boolean isNextValidXmlNameCharacter(); 1430 1431 /** 1432 * Determine if the next character on the sream is a {@link XmlCharacters#isValidNcName(int) valid XML NCName character}. 1433 * This method does <i>not</i> advance the stream. 1434 * 1435 * @return true if there is a {@link #next() next} character and it is a valid XML NCName character, or false otherwise 1436 */ 1437 boolean isNextValidXmlNcNameCharacter(); 1438 1439 /** 1440 * Determine if the next character on the sream is the supplied value. This method does <i>not</i> advance the stream. 
1441 * 1442 * @param c the character value to compare to the next character on the stream 1443 * @return true if there is a {@link #next() next} character and it is the supplied character, or false otherwise 1444 */ 1445 boolean isNext( char c ); 1446 1447 /** 1448 * Determine if the next two characters on the stream match the supplied values. This method does <i>not</i> advance the 1449 * stream. 1450 * 1451 * @param nextChar the character value to compare to the next character on the stream 1452 * @param followingChar the character value to compare to the character immediately after the next character on the stream 1453 * @return true if there are at least two characters left on the stream and the first matches <code>nextChar</code> and 1454 * the second matches <code>followingChar</code> 1455 */ 1456 boolean isNext( char nextChar, 1457 char followingChar ); 1458 1459 /** 1460 * Determine if the next three characters on the sream match the supplied values. This method does <i>not</i> advance the 1461 * stream. 1462 * 1463 * @param nextChar the character value to compare to the next character on the stream 1464 * @param nextChar2 the character value to compare to the second character on the stream 1465 * @param nextChar3 the character value to compare to the second character on the stream 1466 * @return true if there are at least two characters left on the stream and the first matches <code>nextChar</code> and 1467 * the second matches <code>followingChar</code> 1468 */ 1469 boolean isNext( char nextChar, 1470 char nextChar2, 1471 char nextChar3 ); 1472 1473 /** 1474 * Determine if the next character on the stream matches one of the supplied characters. This method does <i>not</i> 1475 * advance the stream. 
*
     * @param characters the characters to match
     * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
     *         otherwise
     */
    boolean isNextAnyOf( char[] characters );

    /**
     * Determine if the next character on the stream matches one of the characters in the supplied string. This method does
     * <i>not</i> advance the stream.
     *
     * @param characters the characters to match
     * @return true if there is a {@link #next() next} character and it does match one of the supplied characters, or false
     *         otherwise
     */
    boolean isNextAnyOf( String characters );

    }

    /**
     * A factory for Token objects, used by a {@link Tokenizer} to create tokens in the correct order.
     */
    public static interface Tokens {
        /**
         * Create a single-character token at the supplied index in the character stream. The token type is set to 0, meaning this
         * is equivalent to calling <code>addToken(position,index,index+1)</code> or
         * <code>addToken(position,index,index+1,0)</code>.
         *
         * @param position the position (line and column numbers) of this new token; may not be null
         * @param index the index of the character to appear in the token; must be a valid index in the stream
         */
        void addToken( Position position,
                       int index );

        /**
         * Create a single- or multi-character token with the characters in the range given by the starting and ending index in
         * the character stream. The character at the ending index is <i>not</i> included in the token (as this is standard
         * practice when using 0-based indexes). The token type is set to 0, meaning this is equivalent to calling
         * <code>addToken(position,startIndex,endIndex,0)</code>.
         *
         * @param position the position (line and column numbers) of this new token; may not be null
         * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream
         * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream
         */
        void addToken( Position position,
                       int startIndex,
                       int endIndex );

        /**
         * Create a single- or multi-character token with the supplied type and with the characters in the range given by the
         * starting and ending index in the character stream. The character at the ending index is <i>not</i> included in the
         * token (as this is standard practice when using 0-based indexes).
         *
         * @param position the position (line and column numbers) of this new token; may not be null
         * @param startIndex the index of the first character to appear in the token; must be a valid index in the stream
         * @param endIndex the index just past the last character to appear in the token; must be a valid index in the stream
         * @param type the type of the token
         */
        void addToken( Position position,
                       int startIndex,
                       int endIndex,
                       int type );
    }

    /**
     * The interface defining a token, which references the characters in the actual input character stream.
     *
     * @see CaseSensitiveTokenFactory
     * @see CaseInsensitiveTokenFactory
     */
    @Immutable
    public interface Token {
        /**
         * Get the value of the token, in actual case.
         *
         * @return the value
         */
        String value();

        /**
         * Determine if the token matches the supplied string.
         *
         * @param expected the expected value
         * @return true if the token's value matches the supplied value, or false otherwise
         */
        boolean matches( String expected );

        /**
         * Determine if the token matches the supplied character.
         *
         * @param expected the expected character value
         * @return true if the token's value matches the supplied character value, or false otherwise
         */
        boolean matches( char expected );

        /**
         * Determine if the token matches the supplied type.
         *
         * @param expectedType the expected integer type
         * @return true if the token's type matches the supplied integer type, or false otherwise
         */
        boolean matches( int expectedType );

        /**
         * Get the type of the token.
         *
         * @return the token's type
         */
        int type();

        /**
         * Get the index in the raw stream for the first character in the token.
         *
         * @return the starting index of the token
         */
        int startIndex();

        /**
         * Get the index in the raw stream past the last character in the token.
         *
         * @return the ending index of the token, which is past the last character
         */
        int endIndex();

        /**
         * Get the length of the token, which is equivalent to <code>endIndex() - startIndex()</code>.
         *
         * @return the length
         */
        int length();

        /**
         * Get the position of this token, which includes the line number and column number of the first character in the token.
         *
         * @return the position; never null
         */
        Position position();

        /**
         * Obtain a copy of this token whose type is the existing type value ORed with the supplied bitmask.
         *
         * @param typeMask the bitmask to OR with the existing type value
         * @return copy of Token with new type
         */
        Token withType( int typeMask );
    }

    /**
     * An immutable {@link Token} that implements matching using case-sensitive logic.
1624 */ 1625 @Immutable 1626 protected class CaseSensitiveToken implements Token { 1627 private final int startIndex; 1628 private final int endIndex; 1629 private final int type; 1630 private final Position position; 1631 1632 public CaseSensitiveToken( int startIndex, 1633 int endIndex, 1634 int type, 1635 Position position ) { 1636 this.startIndex = startIndex; 1637 this.endIndex = endIndex; 1638 this.type = type; 1639 this.position = position; 1640 } 1641 1642 @Override 1643 public Token withType( int typeMask ) { 1644 int type = this.type | typeMask; 1645 return new CaseSensitiveToken(startIndex, endIndex, type, position); 1646 } 1647 1648 @Override 1649 public final int type() { 1650 return type; 1651 } 1652 1653 @Override 1654 public final int startIndex() { 1655 return startIndex; 1656 } 1657 1658 @Override 1659 public final int endIndex() { 1660 return endIndex; 1661 } 1662 1663 @Override 1664 public final int length() { 1665 return endIndex - startIndex; 1666 } 1667 1668 @Override 1669 public final boolean matches( char expected ) { 1670 return length() == 1 && matchString().charAt(startIndex) == expected; 1671 } 1672 1673 @Override 1674 public boolean matches(String expected) { 1675 return matchString().substring(startIndex, endIndex).equals(expected); 1676 } 1677 1678 @Override 1679 public final boolean matches( int expectedType ) { 1680 return expectedType == ANY_TYPE || (currentToken().type() & expectedType) == expectedType; 1681 } 1682 1683 @Override 1684 public final String value() { 1685 return inputString.substring(startIndex, endIndex); 1686 } 1687 1688 @Override 1689 public Position position() { 1690 return position; 1691 } 1692 1693 protected String matchString() { 1694 return inputString; 1695 } 1696 1697 @Override 1698 public String toString() { 1699 return value(); 1700 } 1701 } 1702 1703 @Immutable 1704 protected class CaseInsensitiveToken extends CaseSensitiveToken { 1705 public CaseInsensitiveToken( int startIndex, 1706 int endIndex, 1707 
int type, 1708 Position position ) { 1709 super(startIndex, endIndex, type, position); 1710 } 1711 1712 @Override 1713 public boolean matches( String expected ) { 1714 return matchString().substring(startIndex(), endIndex()).toUpperCase(Locale.ROOT).equals(expected); 1715 } 1716 1717 @Override 1718 public Token withType( int typeMask ) { 1719 int type = this.type() | typeMask; 1720 return new CaseInsensitiveToken(startIndex(), endIndex(), type, position()); 1721 } 1722 } 1723 1724 protected abstract class TokenFactory implements Tokens { 1725 protected final List<Token> tokens = new ArrayList<Token>(); 1726 1727 @Override 1728 public void addToken( Position position, 1729 int index ) { 1730 addToken(position, index, index + 1, 0); 1731 } 1732 1733 @Override 1734 public final void addToken( Position position, 1735 int startIndex, 1736 int endIndex ) { 1737 addToken(position, startIndex, endIndex, 0); 1738 } 1739 1740 public List<Token> getTokens() { 1741 return tokens; 1742 } 1743 } 1744 1745 public class CaseSensitiveTokenFactory extends TokenFactory { 1746 @Override 1747 public void addToken( Position position, 1748 int startIndex, 1749 int endIndex, 1750 int type ) { 1751 tokens.add(new CaseSensitiveToken(startIndex, endIndex, type, position)); 1752 } 1753 } 1754 1755 public class CaseInsensitiveTokenFactory extends TokenFactory { 1756 @Override 1757 public void addToken( Position position, 1758 int startIndex, 1759 int endIndex, 1760 int type ) { 1761 tokens.add(new CaseInsensitiveToken(startIndex, endIndex, type, position)); 1762 } 1763 } 1764 1765 /** 1766 * An implementation of {@link CharacterStream} that works with a single character array. 
1767 */ 1768 public static final class CharacterArrayStream implements CharacterStream { 1769 private final char[] content; 1770 private int lastIndex = -1; 1771 private final int maxIndex; 1772 private int lineNumber = 1; 1773 private int columnNumber = 0; 1774 private boolean nextCharMayBeLineFeed; 1775 1776 public CharacterArrayStream( char[] content ) { 1777 this.content = content; 1778 this.maxIndex = content.length - 1; 1779 } 1780 1781 @Override 1782 public boolean hasNext() { 1783 return lastIndex < maxIndex; 1784 } 1785 1786 @Override 1787 public int index() { 1788 return lastIndex; 1789 } 1790 1791 @Override 1792 public Position position( int startIndex ) { 1793 return new Position(startIndex, lineNumber, columnNumber); 1794 } 1795 1796 @Override 1797 public char next() { 1798 if (lastIndex >= maxIndex) { 1799 throw new NoSuchElementException(); 1800 } 1801 char result = content[++lastIndex]; 1802 ++columnNumber; 1803 if (result == '\r') { 1804 nextCharMayBeLineFeed = true; 1805 ++lineNumber; 1806 columnNumber = 0; 1807 } else if (result == '\n') { 1808 if (!nextCharMayBeLineFeed) ++lineNumber; 1809 columnNumber = 0; 1810 } else if (nextCharMayBeLineFeed) { 1811 nextCharMayBeLineFeed = false; 1812 } 1813 return result; 1814 } 1815 1816 @Override 1817 public boolean isNext( char c ) { 1818 int nextIndex = lastIndex + 1; 1819 return nextIndex <= maxIndex && content[nextIndex] == c; 1820 } 1821 1822 @Override 1823 public boolean isNext( char nextChar1, 1824 char nextChar2 ) { 1825 int nextIndex1 = lastIndex + 1; 1826 int nextIndex2 = lastIndex + 2; 1827 return nextIndex2 <= maxIndex && content[nextIndex1] == nextChar1 && content[nextIndex2] == nextChar2; 1828 } 1829 1830 @Override 1831 public boolean isNext( char nextChar1, 1832 char nextChar2, 1833 char nextChar3 ) { 1834 int nextIndex1 = lastIndex + 1; 1835 int nextIndex2 = lastIndex + 2; 1836 int nextIndex3 = lastIndex + 3; 1837 return nextIndex3 <= maxIndex && content[nextIndex1] == nextChar1 && 
content[nextIndex2] == nextChar2 1838 && content[nextIndex3] == nextChar3; 1839 } 1840 1841 @Override 1842 public boolean isNextAnyOf( char[] characters ) { 1843 int nextIndex = lastIndex + 1; 1844 if (nextIndex <= maxIndex) { 1845 char nextChar = content[lastIndex + 1]; 1846 for (char c : characters) { 1847 if (c == nextChar) return true; 1848 } 1849 } 1850 return false; 1851 } 1852 1853 @Override 1854 public boolean isNextAnyOf( String characters ) { 1855 int nextIndex = lastIndex + 1; 1856 if (nextIndex <= maxIndex) { 1857 char nextChar = content[lastIndex + 1]; 1858 if (characters.indexOf(nextChar) != -1) return true; 1859 } 1860 return false; 1861 } 1862 1863 @Override 1864 public boolean isNextWhitespace() { 1865 int nextIndex = lastIndex + 1; 1866 return nextIndex <= maxIndex && Character.isWhitespace(content[nextIndex]); 1867 } 1868 1869 @Override 1870 public boolean isNextLetterOrDigit() { 1871 int nextIndex = lastIndex + 1; 1872 return nextIndex <= maxIndex && Character.isLetterOrDigit(content[nextIndex]); 1873 } 1874 1875 @Override 1876 public boolean isNextValidXmlCharacter() { 1877 int nextIndex = lastIndex + 1; 1878 return nextIndex <= maxIndex && XmlCharacters.isValid(content[nextIndex]); 1879 } 1880 1881 @Override 1882 public boolean isNextValidXmlNameCharacter() { 1883 int nextIndex = lastIndex + 1; 1884 return nextIndex <= maxIndex && XmlCharacters.isValidName(content[nextIndex]); 1885 } 1886 1887 @Override 1888 public boolean isNextValidXmlNcNameCharacter() { 1889 int nextIndex = lastIndex + 1; 1890 return nextIndex <= maxIndex && XmlCharacters.isValidNcName(content[nextIndex]); 1891 } 1892 } 1893 1894 /** 1895 * Obtain a basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the 1896 * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments. 
1897 * <p> 1898 * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for 1899 * those situations that happen to be able to use it. 1900 * </p> 1901 * 1902 * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments 1903 * should be stripped and not included in the token stream 1904 * @return the tokenizer; never null 1905 */ 1906 public static BasicTokenizer basicTokenizer( boolean includeComments ) { 1907 return new BasicTokenizer(includeComments); 1908 } 1909 1910 /** 1911 * A basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the period 1912 * ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments. 1913 * <p> 1914 * Note this Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for those situations 1915 * that happen to be able to use it. 1916 * </p> 1917 */ 1918 public static class BasicTokenizer implements Tokenizer { 1919 /** 1920 * The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence made 1921 * up of non-whitespace and non-symbol characters. 1922 */ 1923 public static final int WORD = 1; 1924 /** 1925 * The {@link Token#type() token type} for tokens that consist of an individual "symbol" character. The set of characters 1926 * includes: <code>-(){}*,;+%?$[]!<>|=:</code> 1927 */ 1928 public static final int SYMBOL = 2; 1929 /** 1930 * The {@link Token#type() token type} for tokens that consist of an individual '.' character. 1931 */ 1932 public static final int DECIMAL = 4; 1933 /** 1934 * The {@link Token#type() token type} for tokens that consist of all the characters within single-quotes. Single quote 1935 * characters are included if they are preceded (escaped) by a '\' character. 
1936 */ 1937 public static final int SINGLE_QUOTED_STRING = 8; 1938 /** 1939 * The {@link Token#type() token type} for tokens that consist of all the characters within double-quotes. Double quote 1940 * characters are included if they are preceded (escaped) by a '\' character. 1941 */ 1942 public static final int DOUBLE_QUOTED_STRING = 16; 1943 /** 1944 * The {@link Token#type() token type} for tokens that consist of all the characters between "/*" and "*/" or between 1945 * "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"). 1946 */ 1947 public static final int COMMENT = 32; 1948 1949 private final boolean useComments; 1950 1951 protected BasicTokenizer( boolean useComments ) { 1952 this.useComments = useComments; 1953 } 1954 1955 @Override 1956 public void tokenize( CharacterStream input, 1957 Tokens tokens ) throws ParsingException { 1958 while (input.hasNext()) { 1959 char c = input.next(); 1960 switch (c) { 1961 case ' ': 1962 case '\t': 1963 case '\n': 1964 case '\r': 1965 // Just skip these whitespace characters ... 
1966 break; 1967 case '-': 1968 case '(': 1969 case ')': 1970 case '{': 1971 case '}': 1972 case '*': 1973 case ',': 1974 case ';': 1975 case '+': 1976 case '%': 1977 case '?': 1978 case '$': 1979 case '[': 1980 case ']': 1981 case '!': 1982 case '<': 1983 case '>': 1984 case '|': 1985 case '=': 1986 case ':': 1987 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL); 1988 break; 1989 case '.': 1990 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL); 1991 break; 1992 case '\"': 1993 int startIndex = input.index(); 1994 Position startingPosition = input.position(startIndex); 1995 boolean foundClosingQuote = false; 1996 while (input.hasNext()) { 1997 c = input.next(); 1998 if (c == '\\' && input.isNext('"')) { 1999 c = input.next(); // consume the ' character since it is escaped 2000 } else if (c == '"') { 2001 foundClosingQuote = true; 2002 break; 2003 } 2004 } 2005 if (!foundClosingQuote) { 2006 String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(), 2007 startingPosition.getColumn()); 2008 throw new ParsingException(startingPosition, msg); 2009 } 2010 int endIndex = input.index() + 1; // beyond last character read 2011 tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING); 2012 break; 2013 case '\'': 2014 startIndex = input.index(); 2015 startingPosition = input.position(startIndex); 2016 foundClosingQuote = false; 2017 while (input.hasNext()) { 2018 c = input.next(); 2019 if (c == '\\' && input.isNext('\'')) { 2020 c = input.next(); // consume the ' character since it is escaped 2021 } else if (c == '\'') { 2022 foundClosingQuote = true; 2023 break; 2024 } 2025 } 2026 if (!foundClosingQuote) { 2027 String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(), 2028 startingPosition.getColumn()); 2029 throw new ParsingException(startingPosition, msg); 2030 } 2031 endIndex = input.index() + 1; // beyond last character 
read 2032 tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING); 2033 break; 2034 case '/': 2035 startIndex = input.index(); 2036 startingPosition = input.position(startIndex); 2037 if (input.isNext('/')) { 2038 // End-of-line comment ... 2039 boolean foundLineTerminator = false; 2040 while (input.hasNext()) { 2041 c = input.next(); 2042 if (c == '\n' || c == '\r') { 2043 foundLineTerminator = true; 2044 break; 2045 } 2046 } 2047 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s) 2048 if (!foundLineTerminator) ++endIndex; // must point beyond last char 2049 if (c == '\r' && input.isNext('\n')) input.next(); 2050 if (useComments) { 2051 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT); 2052 } 2053 } else if (input.isNext('*')) { 2054 // Multi-line comment ... 2055 while (input.hasNext() && !input.isNext('*', '/')) { 2056 c = input.next(); 2057 } 2058 if (input.hasNext()) input.next(); // consume the '*' 2059 if (input.hasNext()) input.next(); // consume the '/' 2060 if (useComments) { 2061 endIndex = input.index() + 1; // the token will include the '/' and '*' characters 2062 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT); 2063 } 2064 } else { 2065 // just a regular slash ... 2066 tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL); 2067 } 2068 break; 2069 default: 2070 startIndex = input.index(); 2071 startingPosition = input.position(startIndex); 2072 // Read until another whitespace/symbol/decimal/slash is found 2073 while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?$[]!<>|=:"))) { 2074 c = input.next(); 2075 } 2076 endIndex = input.index() + 1; // beyond last character that was included 2077 tokens.addToken(startingPosition, startIndex, endIndex, WORD); 2078 } 2079 } 2080 } 2081 } 2082}