1 module lexer; 2 3 4 import std.ascii; 5 import std.array; 6 import std.exception; 7 import std.string; 8 9 10 class LexerException : Throwable { 11 this(Lexer.State state, string msg) { 12 super(msg); 13 this.state = state; 14 } 15 16 Lexer.State state; 17 } 18 19 20 enum Tok : ubyte { 21 Invalid = 0, 22 EndOfInput, 23 Char, 24 Literal, 25 Identifier, 26 } 27 28 29 enum Literal : ubyte { 30 None = 0, 31 Decimal, 32 Hex, 33 Float, 34 String, 35 } 36 37 38 struct Lexer { 39 this(ubyte[] source) { 40 state.original = source; 41 state.source = source; 42 state.tok = Tok.Invalid; 43 state.literal = Literal.None; 44 popFront; 45 } 46 47 Tok next() { 48 popFront; 49 return state.tok; 50 } 51 52 string nextValue() { 53 popFront; 54 return cast(string)state.value; 55 } 56 57 Tok pop() { 58 Tok tok = state.tok; 59 popFront; 60 return tok; 61 } 62 63 string popValue() { 64 string value = cast(string)state.value; 65 popFront; 66 return value; 67 } 68 69 void popFront() { 70 expectNot(Tok.EndOfInput); 71 72 auto cursor = state.source.ptr; 73 auto end = state.source.ptr + state.source.length; 74 75 skipWhites(cursor, end); 76 scope(exit) { 77 skipWhites(cursor, end); 78 state.source = state.source[cursor - state.source.ptr..$]; 79 } 80 81 while (cursor != end) { 82 auto tokStart = cursor; 83 state.start = cast(size_t)(tokStart - state.source.ptr); 84 state.literal = Literal.None; 85 86 auto ch = *cursor++; 87 88 switch(ch) { 89 case '.': 90 case '0': .. case '9': 91 if ((ch == '.') && !isDigit(*cursor)) 92 goto default; 93 94 auto suffix = false; 95 state.tok = Tok.Literal; 96 state.literal = (ch == '.') ? Literal.Float : Literal.Decimal; 97 98 while (cursor != end) { 99 ch = *cursor; 100 switch(ch) { 101 case '_': 102 enforce(*(cursor - 1) != '_', cursor, "unexpected digit grouping '_' in literal"); 103 ++cursor; 104 continue; 105 case '.': 106 enforce(state.literal != Literal.Hex, cursor, "unexpected '.' in hex literal"); 107 auto next = std.ascii.toLower(peek(cursor, 1, end)); 108 if (next && !isDigit(next) && (next != 'f')) 109 break; 110 enforce(state.literal != Literal.Float, cursor, "unexpected '.' in float literal"); 111 state.literal = Literal.Float; 112 ++cursor; 113 continue; 114 case 'x': 115 enforce(state.literal != Literal.Hex, cursor, "unexpected second 'x' in hex literal"); 116 enforce((*(cursor - 1) == '0') && (cursor - 1 == tokStart), cursor, "unexpected 'x' in numeric literal"); 117 state.tok = Tok.Literal; 118 state.literal = Literal.Hex; 119 tokStart += 2; 120 ++cursor; 121 continue; 122 default: 123 if (isDigit(ch)) { 124 ++cursor; 125 continue; 126 } else if (isAlpha(ch)) { 127 if (state.literal == Literal.Hex) { 128 ch = std.ascii.toLower(*cursor); 129 enforce((ch >= 'a') && (ch <= 'f'), cursor, "unexpected '%c' in hex literal", cast(char)ch); 130 ++cursor; 131 continue; 132 } else if ((state.literal == Literal.Float) && (ch == 'f')) { 133 ++cursor; 134 suffix = true; 135 break; 136 } 137 enforce(isWhite(ch), cursor, "unexpected '%c' in literal", cast(char)*cursor); 138 } 139 break; 140 } 141 break; 142 } 143 state.value = tokStart[0..cursor - tokStart - (suffix ? 1 : 0)]; 144 return; 145 case '"': case '\'': case '`': 146 cursor = skipUntil(cursor, end, ch, ch != '`'); 147 enforce(*cursor == ch, cursor, "unexpected end of input in string literal"); 148 ++cursor; 149 state.value = tokStart[0..cursor - tokStart]; 150 state.tok = Tok.Literal; 151 state.literal = Literal.String; 152 return; 153 case 'a': .. case 'z': 154 case 'A': .. case 'Z': 155 case '_': 156 while ((cursor != end) && (isAlphaNum(*cursor) || (*cursor == '_'))) 157 ++cursor; 158 state.value = tokStart[0..cursor - tokStart]; 159 state.tok = Tok.Identifier; 160 return; 161 default: 162 if (isASCII(ch)) { 163 state.value = tokStart[0..cursor - tokStart]; 164 state.tok = Tok.Char; 165 return; 166 } 167 break; 168 } 169 } 170 171 state.tok = Tok.EndOfInput; 172 } 173 174 bool empty() nothrow { 175 return state.tok == Tok.EndOfInput; 176 } 177 178 Tok front() nothrow { 179 return state.tok; 180 } 181 182 Literal literal() nothrow { 183 return state.literal; 184 } 185 186 string value() nothrow { 187 return cast(string)state.value; 188 } 189 190 string remaining() nothrow { 191 return cast(string)state.source; 192 } 193 194 size_t cursor() nothrow { 195 return state.original.length - state.source.length; 196 } 197 198 Tok popUntil(Args...)(auto ref Args args) { 199 while (true) { 200 auto tok = next; 201 foreach(i, Arg; Args) { 202 static assert(is(Arg == Tok) || is(Arg == string)); 203 static if (is(Arg == Tok)) { 204 if (state.tok == args[i]) 205 return state.tok; 206 } else static if (is(Arg == string)) { 207 if ((state.tok == Tok.Char) && (state.value == args[i])) 208 return state.tok; 209 } 210 } 211 } 212 213 import std.format; 214 Appender!string exception; 215 216 foreach(i, arg; args) { 217 if (i == 0) { 218 formattedWrite(&exception, "expected '%s'", arg); 219 } else if (i + 1 == args.length) { 220 formattedWrite(&exception, ", or '%s'", arg); 221 } else { 222 formattedWrite(&exception, ", '%s'", arg); 223 } 224 } 225 226 if (state.tok == Tok.Char) { 227 formattedWrite(&exception, " but found '%s'", state.value); 228 } else { 229 formattedWrite(&exception, " but found '%s'", state.tok); 230 } 231 232 throw new LexerException(state, exception.data); 233 return state.tok; 234 } 235 236 Tok popUntilBalanced(string close, string open) { 237 assert(value == open); 238 assert(close != open); 239 240 size_t opened = 1; 241 while (opened) { 242 popFront; 243 if (state.tok == Tok.Char) { 244 if (state.value == close) { 245 if (opened > 0) 246 --opened; 247 } else if (state.value == open) { 248 ++opened; 249 } 250 } else if (state.tok == Tok.EndOfInput) { 251 expectNot(Tok.EndOfInput); 252 } 253 } 254 255 return state.tok; 256 } 257 258 auto expectNot(Args...)(auto ref Args args) { 259 foreach(i, Arg; Args) { 260 static assert(is(Arg == Tok) || is(Arg == string)); 261 static if (is(Arg == Tok)) { 262 if (state.tok != args[i]) 263 continue; 264 } else static if (is(Arg == string)) { 265 if ((state.tok != Tok.Char) || (state.value != args[i])) 266 continue; 267 } 268 269 { 270 static if (is(Arg == Tok)) { 271 throw new Exception(format("unexpected '%s'", args[i])); 272 } else { 273 throw new Exception(format("unexpected '%s'", state.value)); 274 } 275 } 276 } 277 278 return state.tok; 279 } 280 281 auto expect(Args...)(auto ref Args args) { 282 foreach(i, Arg; Args) { 283 static assert(is(Arg == Tok) || is(Arg == string)); 284 static if (is(Arg == Tok)) { 285 if (state.tok == args[i]) 286 return state.tok; 287 } else static if (is(Arg == string)) { 288 if ((state.tok == Tok.Char) && (state.value == args[i])) 289 return state.tok; 290 } 291 } 292 293 import std.format; 294 Appender!string exception; 295 296 foreach(i, arg; args) { 297 if (i == 0) { 298 formattedWrite(&exception, "expected '%s'", arg); 299 } else if (i + 1 == args.length) { 300 formattedWrite(&exception, ", or '%s'", arg); 301 } else { 302 formattedWrite(&exception, ", '%s'", arg); 303 } 304 } 305 306 if (state.tok == Tok.Char) { 307 formattedWrite(&exception, " but found '%s'", state.value); 308 } else { 309 formattedWrite(&exception, " but found '%s'", state.tok); 310 } 311 312 throw new LexerException(state, exception.data); 313 } 314 315 int opApply(scope int delegate(Tok tok, string val) del) { 316 while(!empty) { 317 auto tok = state.tok; 318 auto value = cast(string)state.value; 319 popFront; 320 if (auto ret = del(tok, value)) 321 return ret; 322 } 323 return 0; 324 } 325 326 int opApply(scope int delegate(Tok tok, Literal literal, string val) del) { 327 while(!empty) { 328 auto tok = state.tok; 329 auto value = cast(string)state.value; 330 auto literal = state.literal; 331 popFront; 332 if (auto ret = del(tok, literal, value)) 333 return ret; 334 } 335 return 0; 336 } 337 338 private: 339 ubyte peek(ubyte* cursor, int offset, ubyte* end) { 340 auto ptr = cursor + offset; 341 if (ptr < end) 342 return *ptr; 343 return 0; 344 } 345 346 ref ubyte* skipWhites(ref ubyte* cursor, ubyte* end) { 347 while ((cursor != end) && isWhite(*cursor)) { 348 if (*cursor == '\n') { 349 ++state.line; 350 state.lineStart = cast(size_t)((cursor + 1) - state.source.ptr); 351 } 352 ++cursor; 353 state.source = state.source[cursor - state.source.ptr..$]; 354 } 355 return cursor; 356 } 357 358 ref ubyte* skipUntil(ref ubyte* cursor, ubyte* end, ubyte ch, bool escape = false) { 359 while ((cursor != end) && (*cursor != ch)) { 360 if (escape && (*cursor == '\\')) { 361 ++cursor; 362 enforce(cursor != end, cursor, "incomplete escape sequence", cast(char)*cursor); 363 } 364 if (*cursor == '\n') { 365 ++state.line; 366 state.lineStart = cast(size_t)((cursor + 1) - state.source.ptr); 367 } 368 ++cursor; 369 state.source = state.source[cursor - state.source.ptr..$]; 370 } 371 return cursor; 372 } 373 374 void enforce(Args...)(bool value, ubyte* cursor, string fmt, Args args) { 375 if (!value) { 376 state.offset = cast(size_t)(cursor - state.source.ptr); 377 throw new LexerException(state, format(fmt, args)); 378 } 379 } 380 381 struct State { 382 size_t line; 383 size_t lineStart; 384 size_t start; 385 size_t offset; 386 ubyte[] original; 387 ubyte[] source; 388 ubyte[] value; 389 Tok tok = Tok.EndOfInput; 390 Literal literal = Literal.None; 391 } 392 393 State state; 394 } 395 396 397 398 void testLexer() { 399 import std.stdio; 400 void expect(string source, Tok tok, string value, bool one = true) { 401 auto lex = Lexer(cast(ubyte[])source); 402 assert(lex.front == tok); 403 assert(lex.value == value); 404 if (one) { 405 lex.popFront; 406 assert(lex.empty); 407 } 408 } 409 410 void expectException(string source) { 411 try { 412 auto lex = Lexer(cast(ubyte[])source); 413 } catch (LexerException e) { 414 return; 415 } 416 assert(false); 417 } 418 419 expect("foo", Tok.Identifier, "foo"); 420 expect("_foo", Tok.Identifier, "_foo"); 421 expect("_fo_o_", Tok.Identifier, "_fo_o_"); 422 423 expect(`"foo"`, Tok.Literal, `"foo"`); 424 expect(`'foo'`, Tok.Literal, `'foo'`); 425 426 expect("1337", Tok.Literal, "1337"); 427 expect("13_37", Tok.Literal, "13_37"); 428 expect("0xbeef", Tok.Literal, "beef"); 429 expect("1.0", Tok.Literal, "1.0"); 430 expect("1.0f", Tok.Literal, "1.0"); 431 expect(".1", Tok.Literal, ".1"); 432 expect(".1f", Tok.Literal, ".1"); 433 expect("1.", Tok.Literal, "1."); 434 expect("1.f", Tok.Literal, "1."); 435 expect("1..", Tok.Literal, "1", false); 436 expect("1...", Tok.Literal, "1", false); 437 expect("1[]", Tok.Literal, "1", false); 438 expect("123[]", Tok.Literal, "123", false); 439 expect("1.h", Tok.Literal, "1", false); 440 441 expect(".", Tok.Char, "."); 442 expect("(", Tok.Char, "("); 443 expect("+", Tok.Char, "+"); 444 445 expectException(`"foo`); 446 expectException(`'foo`); 447 expectException(`1__2`); 448 expectException(`0xabcdefg`); 449 expectException(`1.0h`); 450 } 451 452 unittest { 453 testLexer(); 454 }