001/* 002 * ModeShape (http://www.modeshape.org) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.modeshape.sequencer.ddl; 017 018import java.util.ArrayList; 019import java.util.Arrays; 020import java.util.HashSet; 021import java.util.List; 022import java.util.Set; 023import org.modeshape.common.CommonI18n; 024import org.modeshape.common.text.ParsingException; 025import org.modeshape.common.text.Position; 026import org.modeshape.common.text.TokenStream; 027 028/** 029 * A TokenStream implementation designed around requirements for tokenizing and parsing DDL statements. 030 * <p> 031 * Because of the complexity of DDL, it was necessary to extend {@link TokenStream} in order to override the basic tokenizer to 032 * tokenize the in-line comments prefixed with "--". In addition, because there is not a default ddl command (or statement) 033 * terminator, an override method was added to {@link TokenStream} to allow re-tokenizing the initial tokens to re-type the 034 * tokens, remove tokens, or any other operation to simplify parsing. 035 * </p> 036 * <p> 037 * In this case, both reserved words (or key words) and statement start phrases can be registered prior to the {@link TokenStream} 038 * 's start() method. Any resulting tokens that match the registered string values will be re-typed to identify them as key words 039 * (DdlTokenizer.KEYWORD) or statement start phrases (DdlTokenizer.STATEMENT_KEY). 040 * </p> 041 */ 042public class DdlTokenStream extends TokenStream { 043 044 protected List<String[]> registeredStatementStartPhrases = new ArrayList<String[]>(); 045 046 protected Set<String> registeredKeyWords = new HashSet<String>(); 047 048 private Position currentMarkedPosition = Position.EMPTY_CONTENT_POSITION; 049 050 /** 051 * {@inheritDoc} 052 * 053 * @see org.modeshape.common.text.TokenStream#initializeTokens(java.util.List) 054 */ 055 @Override 056 protected List<Token> initializeTokens( List<Token> tokens ) { 057 // THIS IS WHERE WE DO THE WORK OF PRE-PARSING TOKENS AND REPLACING KEYWORDS AND STATEMENT STARTS WITH 058 // APPLICABLE TOKEN TYPE BITMASK VALUES 059 // MyClass[] array = (MyClass[])list.toArray(new MyClass[list.size()]); 060 061 Token[] tokensArray = tokens.toArray(new Token[tokens.size()]); 062 List<Token> reTypedTokens = new ArrayList<Token>(tokens.size()); 063 064 for (int i = 0; i < tokensArray.length; i++) { 065 boolean isStatementStart = false; 066 if (isKeyWord(tokensArray[i].value())) { 067 Token retypedToken = tokensArray[i].withType(DdlTokenizer.KEYWORD); 068 // Now we check to see if this keyword begins a registered statement start 069 070 // Keep track of token increment (# of tokens for a phrase) 071 // Need to increment iterator (i) in case phrases like "ALTER ROLLBACK" appear. ROLLBACK is also a statement 072 // start phrase and we need to walk ignore ROLLBACK in this case. 073 int tokenIncrement = 0; 074 for (String[] nextStmtStart : registeredStatementStartPhrases) { 075 boolean matches = true; 076 077 for (int j = 0; j < nextStmtStart.length; j++) { 078 if (matches) { 079 matches = nextStmtStart[j].equalsIgnoreCase(tokensArray[i + j].value()) 080 || nextStmtStart[j].equals(ANY_VALUE); 081 } 082 } 083 if (matches) { 084 isStatementStart = true; 085 tokenIncrement = nextStmtStart.length - 1; 086 break; 087 } 088 } 089 if (isStatementStart) { 090 retypedToken = retypedToken.withType(DdlTokenizer.STATEMENT_KEY); 091 } 092 reTypedTokens.add(retypedToken); 093 094 if (isStatementStart) { 095 // Copy any additional tokens used in the phrase 096 for (int k = 0; k < tokenIncrement; k++) { 097 i++; 098 reTypedTokens.add(tokensArray[i]); 099 } 100 } 101 } else { 102 reTypedTokens.add(tokensArray[i]); 103 } 104 105 } 106 107 return reTypedTokens; 108 } 109 110 /** 111 * @param content 112 * @param tokenizer 113 * @param caseSensitive 114 */ 115 public DdlTokenStream( String content, 116 Tokenizer tokenizer, 117 boolean caseSensitive ) { 118 super(content, tokenizer, caseSensitive); 119 } 120 121 /** 122 * Register a phrase representing the start of a DDL statement 123 * <p> 124 * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"} 125 * </p> 126 * see {@link DdlConstants} for the default SQL 92 representations. 127 * 128 * @param phrase 129 */ 130 public void registerStatementStartPhrase( String[] phrase ) { 131 registeredStatementStartPhrases.add(phrase); 132 } 133 134 public void registerStatementStartPhrase( String[][] phrases ) { 135 for (String[] phrase : phrases) { 136 registeredStatementStartPhrases.add(phrase); 137 } 138 } 139 140 /** 141 * Register a single key word. 142 * 143 * @param keyWord 144 */ 145 public void registerKeyWord( String keyWord ) { 146 registeredKeyWords.add(keyWord); 147 } 148 149 /** 150 * Register an {@link List} of key words. 151 * 152 * @param keyWords 153 */ 154 public void registerKeyWords( List<String> keyWords ) { 155 registeredKeyWords.addAll(keyWords); 156 } 157 158 /** 159 * Register an array of key words. 160 * 161 * @param keyWords 162 */ 163 public void registerKeyWords( String[] keyWords ) { 164 registeredKeyWords.addAll(Arrays.asList(keyWords)); 165 } 166 167 /** 168 * @param word 169 * @return is Key Word 170 */ 171 protected boolean isKeyWord( String word ) { 172 return registeredKeyWords.contains(word.toUpperCase()); 173 } 174 175 /** 176 * Method to determine if the next token is of type {@link DdlTokenizer} KEYWORD. 177 * 178 * @return is Key Word 179 */ 180 public boolean isNextKeyWord() { 181 return this.matches(DdlTokenizer.KEYWORD); 182 } 183 184 /** 185 * Method to determine if next tokens match a registered statement start phrase. 186 * 187 * @return number of keywords in matched registered statement start phrase or zero if not matched 188 */ 189 public int computeNextStatementStartKeywordCount() { 190 int result = 0; 191 192 if (isNextKeyWord()) { 193 for (String[] nextStmtStart : registeredStatementStartPhrases) { 194 if (this.matches(nextStmtStart)) { 195 return nextStmtStart.length; 196 } 197 } 198 } 199 200 return result; 201 } 202 203 /** 204 * Marks the current position (line & column number) of the currentToken 205 */ 206 public void mark() { 207 if (this.hasNext()) { 208 currentMarkedPosition = this.nextPosition(); 209 } else { 210 currentMarkedPosition = null; 211 } 212 213 } 214 215 /** 216 * Returns the string content for characters bounded by the previous marked position and the position of the currentToken 217 * (inclusive). Method also marks() the new position the the currentToken. 218 * 219 * @return the string content for characters bounded by the previous marked position and the position of the currentToken 220 * (inclusive). 221 */ 222 public String getMarkedContent() { 223 Position startPosition = new Position(currentMarkedPosition.getIndexInContent(), currentMarkedPosition.getLine(), 224 currentMarkedPosition.getColumn()); 225 226 mark(); 227 228 return getContentBetween(startPosition, currentMarkedPosition); 229 } 230 231 /** 232 * Obtain a ddl {@link DdlTokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the 233 * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments. 234 * <p> 235 * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for 236 * those situations that happen to be able to use it. 237 * </p> 238 * 239 * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments 240 * should be stripped and not included in the token stream 241 * @return the tokenizer; never null 242 */ 243 public static DdlTokenizer ddlTokenizer( boolean includeComments ) { 244 return new DdlTokenizer(includeComments); 245 } 246 247 public static class DdlTokenizer implements Tokenizer { 248 public static final String PARSER_ID = "PARSER_ID"; 249 250 /** 251 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent an unquoted string 252 * containing a character sequence made up of non-whitespace and non-symbol characters. 253 */ 254 public static final int WORD = 1; 255 /** 256 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual 257 * "symbol" character. The set of characters includes: <code>-(){}*,;+%?$[]!<>|=:</code> 258 */ 259 public static final int SYMBOL = 2; 260 /** 261 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual '.' 262 * character. 263 */ 264 public static final int DECIMAL = 4; 265 /** 266 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters 267 * within single-quotes. Single quote characters are included if they are preceded (escaped) by a '\' character. 268 */ 269 public static final int SINGLE_QUOTED_STRING = 8; 270 /** 271 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters 272 * within double-quotes. Double quote characters are included if they are preceded (escaped) by a '\' character. 273 */ 274 public static final int DOUBLE_QUOTED_STRING = 16; 275 /** 276 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters 277 * between "/*" and "*/", between "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"), or between "--" and 278 * the next line terminator (e.g., '\n', '\r' or "\r\n"). 279 */ 280 public static final int COMMENT = 32; 281 282 private final boolean useComments; 283 284 /** 285 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent key words or 286 * reserved words for a given DDL dialect. 287 * <p> 288 * Examples would be: "CREATE", "TABLE", "ALTER", "SCHEMA", "DROP", etc... 289 * </p> 290 * see {@link DdlConstants} for the default SQL 92 representations. 291 */ 292 public static final int KEYWORD = 64; 293 294 /** 295 * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent the start of a DDL 296 * statement. 297 * <p> 298 * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"} 299 * </p> 300 * see {@link DdlConstants} for the default SQL 92 representations. 301 */ 302 public static final int STATEMENT_KEY = 128; 303 304 public DdlTokenizer( boolean useComments ) { 305 this.useComments = useComments; 306 } 307 308 /** 309 * @return useComments 310 */ 311 public boolean includeComments() { 312 return useComments; 313 } 314 315 /** 316 * {@inheritDoc} 317 * 318 * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(TokenStream.CharacterStream, TokenStream.Tokens) 319 */ 320 @Override 321 public void tokenize( CharacterStream input, 322 Tokens tokens ) throws ParsingException { 323 int startIndex; 324 int endIndex; 325 while (input.hasNext()) { 326 char c = input.next(); 327 switch (c) { 328 case ' ': 329 case '\t': 330 case '\n': 331 case '\r': 332 // Just skip these whitespace characters ... 333 break; 334 // ============================================================================================== 335 // DDL Comments token = "--" 336 // ============================================================================================== 337 case '-': { 338 startIndex = input.index(); 339 Position startPosition = input.position(startIndex); 340 if (input.isNext('-')) { 341 // -- END OF LINE comment ... 342 boolean foundLineTerminator = false; 343 while (input.hasNext()) { 344 c = input.next(); 345 if (c == '\n' || c == '\r') { 346 foundLineTerminator = true; 347 break; 348 } 349 } 350 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s) 351 if (!foundLineTerminator) ++endIndex; // must point beyond last char 352 if (c == '\r' && input.isNext('\n')) input.next(); 353 354 // Check for PARSER_ID 355 356 if (useComments) { 357 tokens.addToken(startPosition, startIndex, endIndex, COMMENT); 358 } 359 360 } else { 361 // just a regular dash ... 362 tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL); 363 } 364 break; 365 } 366 // ============================================================================================== 367 case '(': 368 case ')': 369 case '{': 370 case '}': 371 case '*': 372 case ',': 373 case ';': 374 case '+': 375 case '%': 376 case '?': 377 case '[': 378 case ']': 379 case '!': 380 case '<': 381 case '>': 382 case '|': 383 case '=': 384 case ':': 385 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL); 386 break; 387 case '.': 388 tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL); 389 break; 390 case '\"': 391 startIndex = input.index(); 392 Position startingPosition = input.position(startIndex); 393 boolean foundClosingQuote = false; 394 while (input.hasNext()) { 395 c = input.next(); 396 if ((c == '\\' || c == '"') && input.isNext('"')) { 397 c = input.next(); // consume the ' character since it is escaped 398 } else if (c == '"') { 399 foundClosingQuote = true; 400 break; 401 } 402 } 403 if (!foundClosingQuote) { 404 String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(), 405 startingPosition.getColumn()); 406 throw new ParsingException(startingPosition, msg); 407 } 408 endIndex = input.index() + 1; // beyond last character read 409 tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING); 410 break; 411 case '\u2019': // '’': 412 case '\'': 413 char quoteChar = c; 414 startIndex = input.index(); 415 startingPosition = input.position(startIndex); 416 foundClosingQuote = false; 417 while (input.hasNext()) { 418 c = input.next(); 419 if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) { 420 c = input.next(); // consume the ' character since it is escaped 421 } else if (c == quoteChar) { 422 foundClosingQuote = true; 423 break; 424 } 425 } 426 if (!foundClosingQuote) { 427 String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(), 428 startingPosition.getColumn()); 429 throw new ParsingException(startingPosition, msg); 430 } 431 endIndex = input.index() + 1; // beyond last character read 432 tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING); 433 break; 434 case '/': 435 startIndex = input.index(); 436 startingPosition = input.position(startIndex); 437 if (input.isNext('/')) { 438 // End-of-line comment ... 439 boolean foundLineTerminator = false; 440 while (input.hasNext()) { 441 c = input.next(); 442 if (c == '\n' || c == '\r') { 443 foundLineTerminator = true; 444 break; 445 } 446 } 447 endIndex = input.index(); // the token won't include the '\n' or '\r' character(s) 448 if (!foundLineTerminator) ++endIndex; // must point beyond last char 449 if (c == '\r' && input.isNext('\n')) input.next(); 450 if (useComments) { 451 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT); 452 } 453 454 } else if (input.isNext('*')) { 455 // Multi-line comment ... 456 while (input.hasNext() && !input.isNext('*', '/')) { 457 c = input.next(); 458 } 459 if (input.hasNext()) input.next(); // consume the '*' 460 if (input.hasNext()) input.next(); // consume the '/' 461 462 endIndex = input.index() + 1; // the token will include the '/' and '*' characters 463 if (useComments) { 464 tokens.addToken(startingPosition, startIndex, endIndex, COMMENT); 465 } 466 467 } else { 468 // just a regular slash ... 469 tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL); 470 } 471 break; 472 default: 473 startIndex = input.index(); 474 Position startPosition = input.position(startIndex); 475 // Read until another whitespace/symbol/decimal/slash is found 476 while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:"))) { 477 c = input.next(); 478 } 479 endIndex = input.index() + 1; // beyond last character that was included 480 tokens.addToken(startPosition, startIndex, endIndex, WORD); 481 } 482 } 483 } 484 } 485}