001/*
002 * ModeShape (http://www.modeshape.org)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *       http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.modeshape.common.text;
017
018import static org.hamcrest.core.Is.is;
019import static org.junit.Assert.assertThat;
020import java.util.LinkedList;
021import org.modeshape.common.text.TokenStream.BasicTokenizer;
022import org.modeshape.common.text.TokenStream.CharacterArrayStream;
023import org.modeshape.common.text.TokenStream.Tokens;
024import org.junit.Before;
025import org.junit.Test;
026
027/**
028 * 
029 */
030public class TokenStreamBasicTokenizerTest {
031
032    private BasicTokenizer tokenizer;
033    private Tokens tokenFactory;
034    private LinkedList<int[]> tokenValues;
035
036    @Before
037    public void beforeEach() {
038        tokenizer = TokenStream.basicTokenizer(true);
039        final LinkedList<int[]> tokenValues = new LinkedList<int[]>();
040        tokenFactory = new Tokens() {
041            @Override
042            public void addToken( Position position,
043                                  int index ) {
044                int[] token = new int[] {index, index + 1, 0};
045                tokenValues.add(token);
046            }
047
048            @Override
049            public void addToken( Position position,
050                                  int startIndex,
051                                  int endIndex ) {
052                int[] token = new int[] {startIndex, endIndex, 0};
053                tokenValues.add(token);
054            }
055
056            @Override
057            public void addToken( Position position,
058                                  int startIndex,
059                                  int endIndex,
060                                  int type ) {
061                int[] token = new int[] {startIndex, endIndex, type};
062                tokenValues.add(token);
063            }
064        };
065        this.tokenValues = tokenValues;
066    }
067
068    protected void tokenize( String input ) {
069        tokenizer.tokenize(new CharacterArrayStream(input.toCharArray()), tokenFactory);
070    }
071
072    protected void assertNextTokenIs( int startIndex,
073                                      int endIndex,
074                                      int type ) {
075        int[] token = tokenValues.removeFirst();
076        assertThat(token[0], is(startIndex));
077        assertThat(token[1], is(endIndex));
078        assertThat(token[2], is(type));
079    }
080
081    protected void assertNoMoreTokens() {
082        assertThat(tokenValues.isEmpty(), is(true));
083    }
084
085    @Test
086    public void shouldCreateNoTokensForEmptyContent() {
087        tokenize("");
088        assertNoMoreTokens();
089    }
090
091    @Test
092    public void shouldCreateNoTokensForContentWithOnlyWhitespace() {
093        tokenize("  \t   \n   \r\n  \r  ");
094        assertNoMoreTokens();
095    }
096
097    @Test
098    public void shouldCreateTokenForEachSymbolCharacter() {
099        String content = "-(){}*,;+%?$[]!<>|=:";
100        int numSymbols = content.length();
101        tokenize(content);
102        for (int i = 0; i != numSymbols; ++i) {
103            assertNextTokenIs(i, i + 1, BasicTokenizer.SYMBOL);
104        }
105        assertNoMoreTokens();
106    }
107
108    @Test
109    public void shouldCreateTokenForEachDecimalCharacter() {
110        tokenize(".");
111        assertNextTokenIs(0, 1, BasicTokenizer.DECIMAL);
112        assertNoMoreTokens();
113    }
114
115    @Test
116    public void shouldCreateTokenForEndOfLineComment() {
117        String content = "--//this is a comment\n";
118        tokenize(content);
119        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
120        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
121        assertNextTokenIs(2, content.length() - 1, BasicTokenizer.COMMENT); // -1 because '\n' is not included
122        assertNoMoreTokens();
123    }
124
125    @Test
126    public void shouldCreateTokenForEndOfLineCommentThatEndsWithEndOfString() {
127        String content = "--//this is a comment";
128        tokenize(content);
129        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
130        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
131        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
132        assertNoMoreTokens();
133    }
134
135    @Test
136    public void shouldCreateTokenForMultiLineComment() {
137        String content = "--/*this is a comment*/-";
138        tokenize(content);
139        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
140        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
141        assertNextTokenIs(2, content.length() - 1, BasicTokenizer.COMMENT);
142        assertNextTokenIs(content.length() - 1, content.length(), BasicTokenizer.SYMBOL);
143        assertNoMoreTokens();
144    }
145
146    @Test
147    public void shouldCreateTokenForMultiLineCommentAtEndOfContent() {
148        String content = "--/*this is a comment*/";
149        tokenize(content);
150        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
151        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
152        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
153        assertNoMoreTokens();
154    }
155
156    @Test
157    public void shouldCreateTokenForMultiLineCommentWithoutTerminatingCharacters() {
158        String content = "--/*this is a comment";
159        tokenize(content);
160        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
161        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
162        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
163        assertNoMoreTokens();
164    }
165
166    @Test
167    public void shouldCreateTokenForMultiLineCommentWithoutAllTerminatingCharacters() {
168        String content = "--/*this is a comment*";
169        tokenize(content);
170        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
171        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
172        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
173        assertNoMoreTokens();
174    }
175
176    @Test
177    public void shouldCreateTokenForSingleQuotedString() {
178        String content = "--'this is a single-quoted \n string'-";
179        assertThat(content.charAt(2), is('\''));
180        assertThat(content.charAt(35), is('\''));
181        tokenize(content);
182        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
183        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
184        assertNextTokenIs(2, 36, BasicTokenizer.SINGLE_QUOTED_STRING);
185        assertNextTokenIs(36, 37, BasicTokenizer.SYMBOL);
186        assertNoMoreTokens();
187    }
188
189    @Test
190    public void shouldCreateTokenForSingleQuotedStringWithEscapedSingleQuoteCharacters() {
191        String content = "--'this \"is\" a \\'single-quoted\\' \n string'-";
192        assertThat(content.charAt(2), is('\''));
193        assertThat(content.charAt(41), is('\''));
194        tokenize(content);
195        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
196        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
197        assertNextTokenIs(2, 42, BasicTokenizer.SINGLE_QUOTED_STRING);
198        assertNextTokenIs(42, 43, BasicTokenizer.SYMBOL);
199        assertNoMoreTokens();
200    }
201
202    @Test
203    public void shouldCreateTokenForSingleQuotedStringAtEndOfContent() {
204        String content = "--'this is a single-quoted \n string'";
205        assertThat(content.charAt(2), is('\''));
206        assertThat(content.charAt(35), is('\''));
207        tokenize(content);
208        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
209        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
210        assertNextTokenIs(2, 36, BasicTokenizer.SINGLE_QUOTED_STRING);
211        assertNoMoreTokens();
212    }
213
214    @Test( expected = ParsingException.class )
215    public void shouldCreateTokenForSingleQuotedStringWithoutClosingQuote() {
216        String content = "--'this is a single-quoted \n string";
217        tokenize(content);
218    }
219
220    @Test
221    public void shouldCreateTokenForDoubleQuotedString() {
222        String content = "--\"this is a double-quoted \n string\"-";
223        assertThat(content.charAt(2), is('"'));
224        assertThat(content.charAt(35), is('"'));
225        tokenize(content);
226        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
227        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
228        assertNextTokenIs(2, 36, BasicTokenizer.DOUBLE_QUOTED_STRING);
229        assertNextTokenIs(36, 37, BasicTokenizer.SYMBOL);
230        assertNoMoreTokens();
231    }
232
233    @Test
234    public void shouldCreateTokenForDoubleQuotedStringWithEscapedDoubleQuoteCharacters() {
235        String content = "--\"this 'is' a \\\"double-quoted\\\" \n string\"-";
236        assertThat(content.charAt(2), is('"'));
237        assertThat(content.charAt(41), is('"'));
238        tokenize(content);
239        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
240        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
241        assertNextTokenIs(2, 42, BasicTokenizer.DOUBLE_QUOTED_STRING);
242        assertNextTokenIs(42, 43, BasicTokenizer.SYMBOL);
243        assertNoMoreTokens();
244    }
245
246    @Test
247    public void shouldCreateTokenForDoubleQuotedStringAtEndOfContent() {
248        String content = "--\"this is a double-quoted \n string\"";
249        assertThat(content.charAt(2), is('"'));
250        assertThat(content.charAt(35), is('"'));
251        tokenize(content);
252        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
253        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
254        assertNextTokenIs(2, 36, BasicTokenizer.DOUBLE_QUOTED_STRING);
255        assertNoMoreTokens();
256    }
257
258    @Test( expected = ParsingException.class )
259    public void shouldCreateTokenForDoubleQuotedStringWithoutClosingQuote() {
260        String content = "--\"this is a double-quoted \n string";
261        tokenize(content);
262    }
263
264    @Test
265    public void shouldCreateTokensForWordsWithAlphabeticCharacters() {
266        String content = "This is a series of words.";
267        tokenize(content);
268        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
269        assertNextTokenIs(5, 7, BasicTokenizer.WORD);
270        assertNextTokenIs(8, 9, BasicTokenizer.WORD);
271        assertNextTokenIs(10, 16, BasicTokenizer.WORD);
272        assertNextTokenIs(17, 19, BasicTokenizer.WORD);
273        assertNextTokenIs(20, 25, BasicTokenizer.WORD);
274        assertNextTokenIs(25, 26, BasicTokenizer.DECIMAL);
275        assertNoMoreTokens();
276    }
277
278    @Test
279    public void shouldCreateTokensForWordsWithNumericCharacters() {
280        String content = "1234 4 5353.324";
281        tokenize(content);
282        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
283        assertNextTokenIs(5, 6, BasicTokenizer.WORD);
284        assertNextTokenIs(7, 11, BasicTokenizer.WORD);
285        assertNextTokenIs(11, 12, BasicTokenizer.DECIMAL);
286        assertNextTokenIs(12, 15, BasicTokenizer.WORD);
287        assertNoMoreTokens();
288    }
289
290    @Test
291    public void shouldCreateTokensForWordsWithAlphaNumericCharacters() {
292        String content = "123a 5353.324e100";
293        tokenize(content);
294        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
295        assertNextTokenIs(5, 9, BasicTokenizer.WORD);
296        assertNextTokenIs(9, 10, BasicTokenizer.DECIMAL);
297        assertNextTokenIs(10, 17, BasicTokenizer.WORD);
298        assertNoMoreTokens();
299    }
300}