001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.sequencer.ddl;
017
018import java.util.ArrayList;
019import java.util.Arrays;
020import java.util.HashSet;
021import java.util.List;
022import java.util.Set;
023import org.modeshape.common.CommonI18n;
024import org.modeshape.common.text.ParsingException;
025import org.modeshape.common.text.Position;
026import org.modeshape.common.text.TokenStream;
027
028/**
029 * A TokenStream implementation designed around requirements for tokenizing and parsing DDL statements.
030 * <p>
031 * Because of the complexity of DDL, it was necessary to extend {@link TokenStream} in order to override the basic tokenizer to
032 * tokenize the in-line comments prefixed with "--". In addition, because there is not a default ddl command (or statement)
033 * terminator, an override method was added to {@link TokenStream} to allow re-tokenizing the initial tokens to re-type the
034 * tokens, remove tokens, or any other operation to simplify parsing.
035 * </p>
 * <p>
 * In this case, both reserved words (or key words) and statement start phrases can be registered prior to calling the
 * {@link TokenStream}'s start() method. Any resulting tokens that match the registered string values will be re-typed to
 * identify them as key words (DdlTokenizer.KEYWORD) or statement start phrases (DdlTokenizer.STATEMENT_KEY).
 * </p>
041 */
042public class DdlTokenStream extends TokenStream {
043
    /**
     * The registered statement start phrases, in registration order. Each entry is one phrase, stored as an array of its
     * words (for example {"CREATE", "TABLE"}).
     */
    protected List<String[]> registeredStatementStartPhrases = new ArrayList<String[]>();

    /**
     * The registered key words. {@link #isKeyWord(String)} converts a word to upper case before looking it up in this set.
     */
    protected Set<String> registeredKeyWords = new HashSet<String>();

    /**
     * The position recorded by the most recent call to {@link #mark()}. This is set to <code>null</code> when
     * {@link #mark()} is called while the stream has no more tokens.
     */
    private Position currentMarkedPosition = Position.EMPTY_CONTENT_POSITION;
049
050    /**
051     * {@inheritDoc}
052     * 
053     * @see org.modeshape.common.text.TokenStream#initializeTokens(java.util.List)
054     */
055    @Override
056    protected List<Token> initializeTokens( List<Token> tokens ) {
057        // THIS IS WHERE WE DO THE WORK OF PRE-PARSING TOKENS AND REPLACING KEYWORDS AND STATEMENT STARTS WITH
058        // APPLICABLE TOKEN TYPE BITMASK VALUES
059        // MyClass[] array = (MyClass[])list.toArray(new MyClass[list.size()]);
060
061        Token[] tokensArray = tokens.toArray(new Token[tokens.size()]);
062        List<Token> reTypedTokens = new ArrayList<Token>(tokens.size());
063
064        for (int i = 0; i < tokensArray.length; i++) {
065            boolean isStatementStart = false;
066            if (isKeyWord(tokensArray[i].value())) {
067                Token retypedToken = tokensArray[i].withType(DdlTokenizer.KEYWORD);
068                // Now we check to see if this keyword begins a registered statement start
069
070                // Keep track of token increment (# of tokens for a phrase)
071                // Need to increment iterator (i) in case phrases like "ALTER ROLLBACK" appear. ROLLBACK is also a statement
072                // start phrase and we need to walk ignore ROLLBACK in this case.
073                int tokenIncrement = 0;
074                for (String[] nextStmtStart : registeredStatementStartPhrases) {
075                    boolean matches = true;
076
077                    for (int j = 0; j < nextStmtStart.length; j++) {
078                        if (matches) {
079                            matches = nextStmtStart[j].equalsIgnoreCase(tokensArray[i + j].value())
080                                      || nextStmtStart[j].equals(ANY_VALUE);
081                        }
082                    }
083                    if (matches) {
084                        isStatementStart = true;
085                        tokenIncrement = nextStmtStart.length - 1;
086                        break;
087                    }
088                }
089                if (isStatementStart) {
090                    retypedToken = retypedToken.withType(DdlTokenizer.STATEMENT_KEY);
091                }
092                reTypedTokens.add(retypedToken);
093
094                if (isStatementStart) {
095                    // Copy any additional tokens used in the phrase
096                    for (int k = 0; k < tokenIncrement; k++) {
097                        i++;
098                        reTypedTokens.add(tokensArray[i]);
099                    }
100                }
101            } else {
102                reTypedTokens.add(tokensArray[i]);
103            }
104
105        }
106
107        return reTypedTokens;
108    }
109
    /**
     * Creates a DDL token stream over the supplied content. All three arguments are passed unchanged to the
     * {@link TokenStream} constructor.
     * 
     * @param content the DDL content to be tokenized
     * @param tokenizer the tokenizer used to break the content into tokens; see {@link #ddlTokenizer(boolean)}
     * @param caseSensitive forwarded to {@link TokenStream}; NOTE(review): presumably controls whether token matching is
     *        case sensitive - confirm against that class
     */
    public DdlTokenStream( String content,
                           Tokenizer tokenizer,
                           boolean caseSensitive ) {
        super(content, tokenizer, caseSensitive);
    }
120
    /**
     * Register a phrase representing the start of a DDL statement. The array itself (not a copy) is appended to the list
     * of registered phrases.
     * <p>
     * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
     * </p>
     * see {@link DdlConstants} for the default SQL 92 representations.
     * 
     * @param phrase the words that make up the statement start phrase, in order
     */
    public void registerStatementStartPhrase( String[] phrase ) {
        registeredStatementStartPhrases.add(phrase);
    }
133
134    public void registerStatementStartPhrase( String[][] phrases ) {
135        for (String[] phrase : phrases) {
136            registeredStatementStartPhrases.add(phrase);
137        }
138    }
139
    /**
     * Register a single key word.
     * <p>
     * The word is stored exactly as supplied. Because {@link #isKeyWord(String)} converts a word to upper case before
     * looking it up, a key word that is not registered in upper case will never be matched.
     * </p>
     * 
     * @param keyWord the key word to register
     */
    public void registerKeyWord( String keyWord ) {
        registeredKeyWords.add(keyWord);
    }
148
    /**
     * Register a {@link List} of key words.
     * <p>
     * The words are stored exactly as supplied. Because {@link #isKeyWord(String)} converts a word to upper case before
     * looking it up, key words that are not registered in upper case will never be matched.
     * </p>
     * 
     * @param keyWords the key words to register
     */
    public void registerKeyWords( List<String> keyWords ) {
        registeredKeyWords.addAll(keyWords);
    }
157
158    /**
159     * Register an array of key words.
160     * 
161     * @param keyWords
162     */
163    public void registerKeyWords( String[] keyWords ) {
164        registeredKeyWords.addAll(Arrays.asList(keyWords));
165    }
166
167    /**
168     * @param word
169     * @return is Key Word
170     */
171    protected boolean isKeyWord( String word ) {
172        return registeredKeyWords.contains(word.toUpperCase());
173    }
174
    /**
     * Method to determine if the next token is of type {@link DdlTokenizer} KEYWORD. This simply asks the stream whether
     * the next token matches the {@link DdlTokenizer#KEYWORD} type.
     * <p>
     * NOTE(review): {@link #initializeTokens(List)} re-types the first token of a statement start phrase with
     * {@link DdlTokenizer#STATEMENT_KEY} after re-typing it with KEYWORD; whether such a token still matches KEYWORD depends
     * on the token type handling in {@link TokenStream}, which is not visible here - confirm.
     * </p>
     * 
     * @return is Key Word
     */
    public boolean isNextKeyWord() {
        return this.matches(DdlTokenizer.KEYWORD);
    }
183
184    /**
185     * Method to determine if next tokens match a registered statement start phrase.
186     * 
187     * @return number of keywords in matched registered statement start phrase or zero if not matched
188     */
189    public int computeNextStatementStartKeywordCount() {
190        int result = 0;
191
192        if (isNextKeyWord()) {
193            for (String[] nextStmtStart : registeredStatementStartPhrases) {
194                if (this.matches(nextStmtStart)) {
195                    return nextStmtStart.length;
196                }
197            }
198        }
199
200        return result;
201    }
202
203    /**
204     * Marks the current position (line & column number) of the currentToken
205     */
206    public void mark() {
207        if (this.hasNext()) {
208            currentMarkedPosition = this.nextPosition();
209        } else {
210            currentMarkedPosition = null;
211        }
212
213    }
214
215    /**
216     * Returns the string content for characters bounded by the previous marked position and the position of the currentToken
217     * (inclusive). Method also marks() the new position the the currentToken.
218     * 
219     * @return the string content for characters bounded by the previous marked position and the position of the currentToken
220     *         (inclusive).
221     */
222    public String getMarkedContent() {
223        Position startPosition = new Position(currentMarkedPosition.getIndexInContent(), currentMarkedPosition.getLine(),
224                                              currentMarkedPosition.getColumn());
225
226        mark();
227
228        return getContentBetween(startPosition, currentMarkedPosition);
229    }
230
    /**
     * Obtain a ddl {@link DdlTokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the
     * period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments
     * (including DDL end-of-line comments that start with "--").
     * <p>
     * Note that the resulting Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for
     * those situations that happen to be able to use it.
     * </p>
     * <p>
     * Each call creates a new {@link DdlTokenizer} instance.
     * </p>
     * 
     * @param includeComments true if the comments should be retained and be included in the token stream, or false if comments
     *        should be stripped and not included in the token stream
     * @return the tokenizer; never null
     */
    public static DdlTokenizer ddlTokenizer( boolean includeComments ) {
        return new DdlTokenizer(includeComments);
    }
246
247    public static class DdlTokenizer implements Tokenizer {
        /**
         * A constant with the value "PARSER_ID". It is not referenced by the tokenizing code in this class.
         */
        public static final String PARSER_ID = "PARSER_ID";

        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent an unquoted string
         * containing a character sequence made up of non-whitespace and non-symbol characters.
         */
        public static final int WORD = 1;
        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual
         * "symbol" character. The set of characters includes: {@code -(){}*,;+%?[]!<>|=:/}. A '-' or '/' is only a symbol when
         * it does not begin a comment. Note that '$' is not treated as a symbol by this tokenizer.
         */
        public static final int SYMBOL = 2;
        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual '.'
         * character.
         */
        public static final int DECIMAL = 4;
        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
         * within single-quotes. Single quote characters are included if they are preceded (escaped) by a '\' character or by
         * another single quote character. A string opened with the '\u2019' quote character is tokenized the same way, using
         * that character as the closing quote.
         */
        public static final int SINGLE_QUOTED_STRING = 8;
        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
         * within double-quotes. Double quote characters are included if they are preceded (escaped) by a '\' character or by
         * another double quote character.
         */
        public static final int DOUBLE_QUOTED_STRING = 16;
        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
         * between "/*" and "&#42;/", between "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"), or between "--" and
         * the next line terminator (e.g., '\n', '\r' or "\r\n"). Tokens of this type are only produced when the tokenizer was
         * created to include comments.
         */
        public static final int COMMENT = 32;

        /**
         * Whether comment tokens are added to the token stream (true) or comments are read and discarded (false).
         */
        private final boolean useComments;

        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent key words or
         * reserved words for a given DDL dialect. The tokenizer itself never produces this type; it is assigned when
         * {@link DdlTokenStream} re-types the tokens.
         * <p>
         * Examples would be: "CREATE", "TABLE", "ALTER", "SCHEMA", "DROP", etc...
         * </p>
         * see {@link DdlConstants} for the default SQL 92 representations.
         */
        public static final int KEYWORD = 64;

        /**
         * The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent the start of a DDL
         * statement. The tokenizer itself never produces this type; it is assigned when {@link DdlTokenStream} re-types the
         * tokens.
         * <p>
         * Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
         * </p>
         * see {@link DdlConstants} for the default SQL 92 representations.
         */
        public static final int STATEMENT_KEY = 128;
303
        /**
         * Creates a tokenizer for DDL content.
         * 
         * @param useComments true if comment tokens should be added to the token stream, or false if comments should be read
         *        and discarded
         */
        public DdlTokenizer( boolean useComments ) {
            this.useComments = useComments;
        }
307
        /**
         * Reports whether this tokenizer adds comment tokens to the token stream.
         * 
         * @return true if comments are included as {@link #COMMENT} tokens, or false if they are discarded
         */
        public boolean includeComments() {
            return useComments;
        }
314
315        /**
316         * {@inheritDoc}
317         * 
318         * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(TokenStream.CharacterStream, TokenStream.Tokens)
319         */
320        @Override
321        public void tokenize( CharacterStream input,
322                              Tokens tokens ) throws ParsingException {
323            int startIndex;
324            int endIndex;
325            while (input.hasNext()) {
326                char c = input.next();
327                switch (c) {
328                    case ' ':
329                    case '\t':
330                    case '\n':
331                    case '\r':
332                        // Just skip these whitespace characters ...
333                        break;
334                    // ==============================================================================================
335                    // DDL Comments token = "--"
336                    // ==============================================================================================
337                    case '-': {
338                        startIndex = input.index();
339                        Position startPosition = input.position(startIndex);
340                        if (input.isNext('-')) {
341                            // -- END OF LINE comment ...
342                            boolean foundLineTerminator = false;
343                            while (input.hasNext()) {
344                                c = input.next();
345                                if (c == '\n' || c == '\r') {
346                                    foundLineTerminator = true;
347                                    break;
348                                }
349                            }
350                            endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
351                            if (!foundLineTerminator) ++endIndex; // must point beyond last char
352                            if (c == '\r' && input.isNext('\n')) input.next();
353
354                            // Check for PARSER_ID
355
356                            if (useComments) {
357                                tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
358                            }
359
360                        } else {
361                            // just a regular dash ...
362                            tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
363                        }
364                        break;
365                    }
366                    // ==============================================================================================
367                    case '(':
368                    case ')':
369                    case '{':
370                    case '}':
371                    case '*':
372                    case ',':
373                    case ';':
374                    case '+':
375                    case '%':
376                    case '?':
377                    case '[':
378                    case ']':
379                    case '!':
380                    case '<':
381                    case '>':
382                    case '|':
383                    case '=':
384                    case ':':
385                        tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
386                        break;
387                    case '.':
388                        tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
389                        break;
390                    case '\"':
391                        startIndex = input.index();
392                        Position startingPosition = input.position(startIndex);
393                        boolean foundClosingQuote = false;
394                        while (input.hasNext()) {
395                            c = input.next();
396                            if ((c == '\\' || c == '"') && input.isNext('"')) {
397                                c = input.next(); // consume the ' character since it is escaped
398                            } else if (c == '"') {
399                                foundClosingQuote = true;
400                                break;
401                            }
402                        }
403                        if (!foundClosingQuote) {
404                            String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
405                                                                                    startingPosition.getColumn());
406                            throw new ParsingException(startingPosition, msg);
407                        }
408                        endIndex = input.index() + 1; // beyond last character read
409                        tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
410                        break;
411                    case '\u2019': // '’':
412                    case '\'':
413                        char quoteChar = c;
414                        startIndex = input.index();
415                        startingPosition = input.position(startIndex);
416                        foundClosingQuote = false;
417                        while (input.hasNext()) {
418                            c = input.next();
419                            if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
420                                c = input.next(); // consume the ' character since it is escaped
421                            } else if (c == quoteChar) {
422                                foundClosingQuote = true;
423                                break;
424                            }
425                        }
426                        if (!foundClosingQuote) {
427                            String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
428                                                                                    startingPosition.getColumn());
429                            throw new ParsingException(startingPosition, msg);
430                        }
431                        endIndex = input.index() + 1; // beyond last character read
432                        tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
433                        break;
434                    case '/':
435                        startIndex = input.index();
436                        startingPosition = input.position(startIndex);
437                        if (input.isNext('/')) {
438                            // End-of-line comment ...
439                            boolean foundLineTerminator = false;
440                            while (input.hasNext()) {
441                                c = input.next();
442                                if (c == '\n' || c == '\r') {
443                                    foundLineTerminator = true;
444                                    break;
445                                }
446                            }
447                            endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
448                            if (!foundLineTerminator) ++endIndex; // must point beyond last char
449                            if (c == '\r' && input.isNext('\n')) input.next();
450                            if (useComments) {
451                                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
452                            }
453
454                        } else if (input.isNext('*')) {
455                            // Multi-line comment ...
456                            while (input.hasNext() && !input.isNext('*', '/')) {
457                                c = input.next();
458                            }
459                            if (input.hasNext()) input.next(); // consume the '*'
460                            if (input.hasNext()) input.next(); // consume the '/'
461
462                            endIndex = input.index() + 1; // the token will include the '/' and '*' characters
463                            if (useComments) {
464                                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
465                            }
466
467                        } else {
468                            // just a regular slash ...
469                            tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
470                        }
471                        break;
472                    default:
473                        startIndex = input.index();
474                        Position startPosition = input.position(startIndex);
475                        // Read until another whitespace/symbol/decimal/slash is found
476                        while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:"))) {
477                            c = input.next();
478                        }
479                        endIndex = input.index() + 1; // beyond last character that was included
480                        tokens.addToken(startPosition, startIndex, endIndex, WORD);
481                }
482            }
483        }
484    }
485}