1 module lexer;
2 
3 
4 import std.ascii;
5 import std.array;
6 import std.exception;
7 import std.string;
8 
9 
10 class LexerException : Throwable {
11 	this(Lexer.State state, string msg) {
12 		super(msg);
13 		this.state = state;
14 	}
15 
16 	Lexer.State state;
17 }
18 
19 
20 enum Tok : ubyte {
21 	Invalid = 0,
22 	EndOfInput,
23 	Char,
24 	Literal,
25 	Identifier,
26 }
27 
28 
29 enum Literal : ubyte {
30 	None = 0,
31 	Decimal,
32 	Hex,
33 	Float,
34 	String,
35 }
36 
37 
38 struct Lexer {
39 	this(ubyte[] source) {
40 		state.original = source;
41 		state.source = source;
42 		state.tok = Tok.Invalid;
43 		state.literal = Literal.None;
44 		popFront;
45 	}
46 
47 	Tok next() {
48 		popFront;
49 		return state.tok;
50 	}
51 
52 	string nextValue() {
53 		popFront;
54 		return cast(string)state.value;
55 	}
56 
57 	Tok pop() {
58 		Tok tok = state.tok;
59 		popFront;
60 		return tok;
61 	}
62 
63 	string popValue() {
64 		string value = cast(string)state.value;
65 		popFront;
66 		return value;
67 	}
68 
69 	void popFront() {
70 		expectNot(Tok.EndOfInput);
71 
72 		auto cursor = state.source.ptr;
73 		auto end = state.source.ptr + state.source.length;
74 
75 		skipWhites(cursor, end);
76 		scope(exit) {
77 			skipWhites(cursor, end);
78 			state.source = state.source[cursor - state.source.ptr..$];
79 		}
80 
81 		while (cursor != end) {
82 			auto tokStart = cursor;
83 			state.start = cast(size_t)(tokStart - state.source.ptr);
84 			state.literal = Literal.None;
85 
86 			auto ch = *cursor++;
87 
88 			switch(ch) {
89 				case '.':
90 				case '0': .. case '9':
91 					if ((ch == '.') && !isDigit(*cursor))
92 						goto default;
93 
94 					auto suffix = false;
95 					state.tok = Tok.Literal;
96 					state.literal = (ch == '.') ? Literal.Float : Literal.Decimal;
97 
98 					while (cursor != end) {
99 						ch = *cursor;
100 						switch(ch) {
101 							case '_':
102 								enforce(*(cursor - 1) != '_', cursor, "unexpected digit grouping '_' in literal");
103 								++cursor;
104 								continue;
105 							case '.':
106 								enforce(state.literal != Literal.Hex, cursor, "unexpected '.' in hex literal");
107 								auto next = std.ascii.toLower(peek(cursor, 1, end));
108 								if (next && !isDigit(next) && (next != 'f'))
109 									break;
110 								enforce(state.literal != Literal.Float, cursor, "unexpected '.' in float literal");
111 								state.literal = Literal.Float;
112 								++cursor;
113 								continue;
114 							case 'x':
115 								enforce(state.literal != Literal.Hex, cursor, "unexpected second 'x' in hex literal");
116 								enforce((*(cursor - 1) == '0') && (cursor - 1 == tokStart), cursor, "unexpected 'x' in numeric literal");
117 								state.tok = Tok.Literal;
118 								state.literal = Literal.Hex;
119 								tokStart += 2;
120 								++cursor;
121 								continue;
122 							default:
123 								if (isDigit(ch)) {
124 									++cursor;
125 									continue;
126 								} else if (isAlpha(ch)) {
127 									if (state.literal == Literal.Hex) {
128 										ch = std.ascii.toLower(*cursor);
129 										enforce((ch >= 'a') && (ch <= 'f'), cursor, "unexpected '%c' in hex literal", cast(char)ch);
130 										++cursor;
131 										continue;
132 									} else if ((state.literal == Literal.Float) && (ch == 'f')) {
133 										++cursor;
134 										suffix = true;
135 										break;
136 									}
137 									enforce(isWhite(ch), cursor, "unexpected '%c' in literal", cast(char)*cursor);
138 								}
139 								break;
140 						}
141 						break;
142 					}
143 					state.value = tokStart[0..cursor - tokStart - (suffix ? 1 : 0)];
144 					return;
145 				case '"': case '\'': case '`':
146 					cursor = skipUntil(cursor, end, ch, ch != '`');
147 					enforce(*cursor == ch, cursor, "unexpected end of input in string literal");
148 					++cursor;
149 					state.value = tokStart[0..cursor - tokStart];
150 					state.tok = Tok.Literal;
151 					state.literal = Literal.String;
152 					return;
153 				case 'a': .. case 'z':
154 				case 'A': .. case 'Z':
155 				case '_':
156 					while ((cursor != end) && (isAlphaNum(*cursor) || (*cursor == '_')))
157 						++cursor;
158 					state.value = tokStart[0..cursor - tokStart];
159 					state.tok = Tok.Identifier;
160 					return;
161 				default:
162 					if (isASCII(ch)) {
163 						state.value = tokStart[0..cursor - tokStart];
164 						state.tok = Tok.Char;
165 						return;
166 					}
167 					break;
168 			}
169 		}
170 
171 		state.tok = Tok.EndOfInput;
172 	}
173 
174 	bool empty() nothrow {
175 		return state.tok == Tok.EndOfInput;
176 	}
177 
178 	Tok front() nothrow {
179 		return state.tok;
180 	}
181 
182 	Literal literal() nothrow {
183 		return state.literal;
184 	}
185 
186 	string value() nothrow {
187 		return cast(string)state.value;
188 	}
189 
190 	string remaining() nothrow {
191 		return cast(string)state.source;
192 	}
193 
194 	size_t cursor() nothrow {
195 		return state.original.length - state.source.length;
196 	}
197 
198 	Tok popUntil(Args...)(auto ref Args args) {
199 		while (true) {
200 			auto tok = next;
201 			foreach(i, Arg; Args) {
202 				static assert(is(Arg == Tok) || is(Arg == string));
203 				static if (is(Arg == Tok)) {
204 					if (state.tok == args[i])
205 						return state.tok;
206 				} else static if (is(Arg == string)) {
207 					if ((state.tok == Tok.Char) && (state.value == args[i]))
208 						return state.tok;
209 				}
210 			}
211 		}
212 
213 		import std.format;
214 		Appender!string exception;
215 
216 		foreach(i, arg; args) {
217 			if (i == 0) {
218 				formattedWrite(&exception, "expected '%s'", arg);
219 			} else if (i + 1 == args.length) {
220 				formattedWrite(&exception, ", or '%s'", arg);
221 			} else {
222 				formattedWrite(&exception, ", '%s'", arg);
223 			}
224 		}
225 
226 		if (state.tok == Tok.Char) {
227 			formattedWrite(&exception, " but found '%s'", state.value);
228 		} else {
229 			formattedWrite(&exception, " but found '%s'", state.tok);
230 		}
231 
232 		throw new LexerException(state, exception.data);
233 		return state.tok;
234 	}
235 
236 	Tok popUntilBalanced(string close, string open) {
237 		assert(value == open);
238 		assert(close != open);
239 
240 		size_t opened = 1;
241 		while (opened) {
242 			popFront;
243 			if (state.tok == Tok.Char) {
244 				if (state.value == close) {
245 					if (opened > 0)
246 						--opened;
247 				} else if (state.value == open) {
248 					++opened;
249 				}
250 			} else if (state.tok == Tok.EndOfInput) {
251 				expectNot(Tok.EndOfInput);
252 			}
253 		}
254 
255 		return state.tok;
256 	}
257 
258 	auto expectNot(Args...)(auto ref Args args) {
259 		foreach(i, Arg; Args) {
260 			static assert(is(Arg == Tok) || is(Arg == string));
261 			static if (is(Arg == Tok)) {
262 				if (state.tok != args[i])
263 					continue;
264 			} else static if (is(Arg == string)) {
265 				if ((state.tok != Tok.Char) || (state.value != args[i]))
266 					continue;
267 			}
268 
269 			{
270 				static if (is(Arg == Tok)) {
271 					throw new Exception(format("unexpected '%s'", args[i]));
272 				} else {
273 					throw new Exception(format("unexpected '%s'", state.value));
274 				}
275 			}
276 		}
277 
278 		return state.tok;
279 	}
280 
281 	auto expect(Args...)(auto ref Args args) {
282 		foreach(i, Arg; Args) {
283 			static assert(is(Arg == Tok) || is(Arg == string));
284 			static if (is(Arg == Tok)) {
285 				if (state.tok == args[i])
286 					return state.tok;
287 			} else static if (is(Arg == string)) {
288 				if ((state.tok == Tok.Char) && (state.value == args[i]))
289 					return state.tok;
290 			}
291 		}
292 
293 		import std.format;
294 		Appender!string exception;
295 
296 		foreach(i, arg; args) {
297 			if (i == 0) {
298 				formattedWrite(&exception, "expected '%s'", arg);
299 			} else if (i + 1 == args.length) {
300 				formattedWrite(&exception, ", or '%s'", arg);
301 			} else {
302 				formattedWrite(&exception, ", '%s'", arg);
303 			}
304 		}
305 
306 		if (state.tok == Tok.Char) {
307 			formattedWrite(&exception, " but found '%s'", state.value);
308 		} else {
309 			formattedWrite(&exception, " but found '%s'", state.tok);
310 		}
311 
312 		throw new LexerException(state, exception.data);
313 	}
314 
315 	int opApply(scope int delegate(Tok tok, string val) del) {
316 		while(!empty) {
317 			auto tok = state.tok;
318 			auto value = cast(string)state.value;
319 			popFront;
320 			if (auto ret = del(tok, value))
321 				return ret;
322 		}
323 		return 0;
324 	}
325 
326 	int opApply(scope int delegate(Tok tok, Literal literal, string val) del) {
327 		while(!empty) {
328 			auto tok = state.tok;
329 			auto value = cast(string)state.value;
330 			auto literal = state.literal;
331 			popFront;
332 			if (auto ret = del(tok, literal, value))
333 				return ret;
334 		}
335 		return 0;
336 	}
337 
338 private:
339 	ubyte peek(ubyte* cursor, int offset, ubyte* end) {
340 		auto ptr = cursor + offset;
341 		if (ptr < end)
342 			return *ptr;
343 		return 0;
344 	}
345 
346 	ref ubyte* skipWhites(ref ubyte* cursor, ubyte* end) {
347 		while ((cursor != end) && isWhite(*cursor)) {
348 			if (*cursor == '\n') {
349 				++state.line;
350 				state.lineStart = cast(size_t)((cursor + 1) - state.source.ptr);
351 			}
352 			++cursor;
353 			state.source = state.source[cursor - state.source.ptr..$];
354 		}
355 		return cursor;
356 	}
357 
358 	ref ubyte* skipUntil(ref ubyte* cursor, ubyte* end, ubyte ch, bool escape = false) {
359 		while ((cursor != end) && (*cursor != ch)) {
360 			if (escape && (*cursor == '\\')) {
361 				++cursor;
362 				enforce(cursor != end, cursor, "incomplete escape sequence", cast(char)*cursor);
363 			}
364 			if (*cursor == '\n') {
365 				++state.line;
366 				state.lineStart = cast(size_t)((cursor + 1) - state.source.ptr);
367 			}
368 			++cursor;
369 			state.source = state.source[cursor - state.source.ptr..$];
370 		}
371 		return cursor;
372 	}
373 
374 	void enforce(Args...)(bool value, ubyte* cursor, string fmt, Args args) {
375 		if (!value) {
376 			state.offset = cast(size_t)(cursor - state.source.ptr);
377 			throw new LexerException(state, format(fmt, args));
378 		}
379 	}
380 
381 	struct State {
382 		size_t line;
383 		size_t lineStart;
384 		size_t start;
385 		size_t offset;
386 		ubyte[] original;
387 		ubyte[] source;
388 		ubyte[] value;
389 		Tok tok = Tok.EndOfInput;
390 		Literal literal = Literal.None;
391 	}
392 
393 	State state;
394 }
395 
396 
397 
398 void testLexer() {
399 	import std.stdio;
400 	void expect(string source, Tok tok, string value, bool one = true) {
401 		auto lex = Lexer(cast(ubyte[])source);
402 		assert(lex.front == tok);
403 		assert(lex.value == value);
404 		if (one) {
405 			lex.popFront;
406 			assert(lex.empty);
407 		}
408 	}
409 
410 	void expectException(string source) {
411 		try {
412 			auto lex = Lexer(cast(ubyte[])source);
413 		} catch (LexerException e) {
414 			return;
415 		}
416 		assert(false);
417 	}
418 
419 	expect("foo", Tok.Identifier, "foo");
420 	expect("_foo", Tok.Identifier, "_foo");
421 	expect("_fo_o_", Tok.Identifier, "_fo_o_");
422 
423 	expect(`"foo"`, Tok.Literal, `"foo"`);
424 	expect(`'foo'`, Tok.Literal, `'foo'`);
425 
426 	expect("1337", Tok.Literal, "1337");
427 	expect("13_37", Tok.Literal, "13_37");
428 	expect("0xbeef", Tok.Literal, "beef");
429 	expect("1.0", Tok.Literal, "1.0");
430 	expect("1.0f", Tok.Literal, "1.0");
431 	expect(".1", Tok.Literal, ".1");
432 	expect(".1f", Tok.Literal, ".1");
433 	expect("1.", Tok.Literal, "1.");
434 	expect("1.f", Tok.Literal, "1.");
435 	expect("1..", Tok.Literal, "1", false);
436 	expect("1...", Tok.Literal, "1", false);
437 	expect("1[]", Tok.Literal, "1", false);
438 	expect("123[]", Tok.Literal, "123", false);
439 	expect("1.h", Tok.Literal, "1", false);
440 
441 	expect(".", Tok.Char, ".");
442 	expect("(", Tok.Char, "(");
443 	expect("+", Tok.Char, "+");
444 
445 	expectException(`"foo`);
446 	expectException(`'foo`);
447 	expectException(`1__2`);
448 	expectException(`0xabcdefg`);
449 	expectException(`1.0h`);
450 }
451 
452 unittest {
453 	testLexer();
454 }