/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.common.text;

import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;
import java.util.LinkedList;
import org.modeshape.common.text.TokenStream.BasicTokenizer;
import org.modeshape.common.text.TokenStream.CharacterArrayStream;
import org.modeshape.common.text.TokenStream.Tokens;
import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for the {@link BasicTokenizer} obtained from {@link TokenStream#basicTokenizer(boolean)}.
 * <p>
 * Each test feeds a string to the tokenizer and records every token reported to a stub {@link Tokens} implementation as an
 * {@code int[] {startIndex, endIndex, type}} triple. The tests then consume the recorded triples in order and compare them
 * with the expected boundaries and token type.
 * </p>
 */
public class TokenStreamBasicTokenizerTest {

    private BasicTokenizer tokenizer;
    private Tokens tokenFactory;
    /** The tokens recorded by {@link #tokenFactory}, in the order they were reported. */
    private LinkedList<int[]> tokenValues;

    /**
     * Create a fresh tokenizer and a recording {@link Tokens} stub before every test.
     */
    @Before
    public void beforeEach() {
        // NOTE(review): the 'true' flag presumably makes the tokenizer emit COMMENT tokens (the comment tests below
        // expect them) -- confirm against TokenStream.basicTokenizer.
        tokenizer = TokenStream.basicTokenizer(true);
        // A final local is needed so the anonymous class can capture it; it is named differently from the field
        // to avoid shadowing.
        final LinkedList<int[]> recorded = new LinkedList<int[]>();
        tokenFactory = new Tokens() {
            @Override
            public void addToken( Position position,
                                  int index ) {
                // A single-character token: the end index is exclusive and the type is recorded as 0.
                recorded.add(new int[] {index, index + 1, 0});
            }

            @Override
            public void addToken( Position position,
                                  int startIndex,
                                  int endIndex ) {
                // No explicit type supplied, so the type is recorded as 0.
                recorded.add(new int[] {startIndex, endIndex, 0});
            }

            @Override
            public void addToken( Position position,
                                  int startIndex,
                                  int endIndex,
                                  int type ) {
                recorded.add(new int[] {startIndex, endIndex, type});
            }
        };
        this.tokenValues = recorded;
    }

    /**
     * Tokenize the supplied content, recording every reported token in {@link #tokenValues}.
     *
     * @param input the content to tokenize; may not be null
     */
    protected void tokenize( String input ) {
        tokenizer.tokenize(new CharacterArrayStream(input.toCharArray()), tokenFactory);
    }

    /**
     * Remove the next recorded token and assert that it has the expected boundaries and type.
     * <p>
     * If no recorded token remains, the test fails with an assertion error rather than with the
     * {@link java.util.NoSuchElementException} that {@link LinkedList#removeFirst()} would otherwise throw.
     * </p>
     *
     * @param startIndex the expected start index of the token
     * @param endIndex the expected end index of the token
     * @param type the expected token type
     */
    protected void assertNextTokenIs( int startIndex,
                                      int endIndex,
                                      int type ) {
        assertThat("Expected a token [" + startIndex + "," + endIndex + ") of type " + type + " but no more tokens were recorded",
                   tokenValues.isEmpty(),
                   is(false));
        int[] token = tokenValues.removeFirst();
        assertThat("start index of token", token[0], is(startIndex));
        assertThat("end index of token", token[1], is(endIndex));
        assertThat("type of token", token[2], is(type));
    }

    /**
     * Assert that every recorded token has already been consumed by {@link #assertNextTokenIs(int, int, int)}.
     */
    protected void assertNoMoreTokens() {
        assertThat("Expected no more tokens but " + tokenValues.size() + " remain", tokenValues.isEmpty(), is(true));
    }

    @Test
    public void shouldCreateNoTokensForEmptyContent() {
        tokenize("");
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateNoTokensForContentWithOnlyWhitespace() {
        tokenize(" \t \n \r\n \r ");
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForEachSymbolCharacter() {
        String content = "-(){}*,;+%?$[]!<>|=:";
        int numSymbols = content.length();
        tokenize(content);
        // Every character is expected to be reported as its own one-character SYMBOL token.
        for (int i = 0; i < numSymbols; i++) {
            assertNextTokenIs(i, i + 1, BasicTokenizer.SYMBOL);
        }
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForEachDecimalCharacter() {
        tokenize(".");
        assertNextTokenIs(0, 1, BasicTokenizer.DECIMAL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForEndOfLineComment() {
        String content = "--//this is a comment\n";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, content.length() - 1, BasicTokenizer.COMMENT); // -1 because '\n' is not included
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForEndOfLineCommentThatEndsWithEndOfString() {
        String content = "--//this is a comment";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForMultiLineComment() {
        String content = "--/*this is a comment*/-";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        // The comment token covers everything up to, but not including, the trailing '-'.
        assertNextTokenIs(2, content.length() - 1, BasicTokenizer.COMMENT);
        assertNextTokenIs(content.length() - 1, content.length(), BasicTokenizer.SYMBOL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForMultiLineCommentAtEndOfContent() {
        String content = "--/*this is a comment*/";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForMultiLineCommentWithoutTerminatingCharacters() {
        String content = "--/*this is a comment";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForMultiLineCommentWithoutAllTerminatingCharacters() {
        String content = "--/*this is a comment*";
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, content.length(), BasicTokenizer.COMMENT);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForSingleQuotedString() {
        String content = "--'this is a single-quoted \n string'-";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('\''));
        assertThat(content.charAt(35), is('\''));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 36, BasicTokenizer.SINGLE_QUOTED_STRING);
        assertNextTokenIs(36, 37, BasicTokenizer.SYMBOL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForSingleQuotedStringWithEscapedSingleQuoteCharacters() {
        String content = "--'this \"is\" a \\'single-quoted\\' \n string'-";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('\''));
        assertThat(content.charAt(41), is('\''));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 42, BasicTokenizer.SINGLE_QUOTED_STRING);
        assertNextTokenIs(42, 43, BasicTokenizer.SYMBOL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForSingleQuotedStringAtEndOfContent() {
        String content = "--'this is a single-quoted \n string'";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('\''));
        assertThat(content.charAt(35), is('\''));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 36, BasicTokenizer.SINGLE_QUOTED_STRING);
        assertNoMoreTokens();
    }

    // Despite its name, this test expects tokenizing an unterminated single-quoted string to throw.
    @Test( expected = ParsingException.class )
    public void shouldCreateTokenForSingleQuotedStringWithoutClosingQuote() {
        String content = "--'this is a single-quoted \n string";
        tokenize(content);
    }

    @Test
    public void shouldCreateTokenForDoubleQuotedString() {
        String content = "--\"this is a double-quoted \n string\"-";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('"'));
        assertThat(content.charAt(35), is('"'));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 36, BasicTokenizer.DOUBLE_QUOTED_STRING);
        assertNextTokenIs(36, 37, BasicTokenizer.SYMBOL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForDoubleQuotedStringWithEscapedDoubleQuoteCharacters() {
        String content = "--\"this 'is' a \\\"double-quoted\\\" \n string\"-";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('"'));
        assertThat(content.charAt(41), is('"'));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 42, BasicTokenizer.DOUBLE_QUOTED_STRING);
        assertNextTokenIs(42, 43, BasicTokenizer.SYMBOL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokenForDoubleQuotedStringAtEndOfContent() {
        String content = "--\"this is a double-quoted \n string\"";
        // Sanity-check the fixture so the hard-coded indexes below stay in sync with the content.
        assertThat(content.charAt(2), is('"'));
        assertThat(content.charAt(35), is('"'));
        tokenize(content);
        assertNextTokenIs(0, 1, BasicTokenizer.SYMBOL);
        assertNextTokenIs(1, 2, BasicTokenizer.SYMBOL);
        assertNextTokenIs(2, 36, BasicTokenizer.DOUBLE_QUOTED_STRING);
        assertNoMoreTokens();
    }

    // Despite its name, this test expects tokenizing an unterminated double-quoted string to throw.
    @Test( expected = ParsingException.class )
    public void shouldCreateTokenForDoubleQuotedStringWithoutClosingQuote() {
        String content = "--\"this is a double-quoted \n string";
        tokenize(content);
    }

    @Test
    public void shouldCreateTokensForWordsWithAlphabeticCharacters() {
        String content = "This is a series of words.";
        tokenize(content);
        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
        assertNextTokenIs(5, 7, BasicTokenizer.WORD);
        assertNextTokenIs(8, 9, BasicTokenizer.WORD);
        assertNextTokenIs(10, 16, BasicTokenizer.WORD);
        assertNextTokenIs(17, 19, BasicTokenizer.WORD);
        assertNextTokenIs(20, 25, BasicTokenizer.WORD);
        // The trailing '.' is reported as a DECIMAL token, separate from the preceding word.
        assertNextTokenIs(25, 26, BasicTokenizer.DECIMAL);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokensForWordsWithNumericCharacters() {
        String content = "1234 4 5353.324";
        tokenize(content);
        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
        assertNextTokenIs(5, 6, BasicTokenizer.WORD);
        // "5353.324" is expected as three tokens: WORD, DECIMAL, WORD.
        assertNextTokenIs(7, 11, BasicTokenizer.WORD);
        assertNextTokenIs(11, 12, BasicTokenizer.DECIMAL);
        assertNextTokenIs(12, 15, BasicTokenizer.WORD);
        assertNoMoreTokens();
    }

    @Test
    public void shouldCreateTokensForWordsWithAlphaNumericCharacters() {
        String content = "123a 5353.324e100";
        tokenize(content);
        assertNextTokenIs(0, 4, BasicTokenizer.WORD);
        // "5353.324e100" is expected as three tokens: WORD, DECIMAL, WORD.
        assertNextTokenIs(5, 9, BasicTokenizer.WORD);
        assertNextTokenIs(9, 10, BasicTokenizer.DECIMAL);
        assertNextTokenIs(10, 17, BasicTokenizer.WORD);
        assertNoMoreTokens();
    }
}