module mutils.serializer.lexer_utils;

import std.algorithm : canFind;
import std.meta;
import std.traits;

import mutils.container.vector;
import mutils.conv;
void updateLineAndCol(ref uint line, ref uint column, string oldSlice, string newSlice) {
	foreach (char ch; oldSlice[0 .. oldSlice.length - newSlice.length]) {
		if (ch == '\n') {
			line++;
			column = 0;
		} else {
			column++;
		}
	}
}
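
// Usage sketch: updateLineAndCol walks the prefix consumed between two slices of
// the same buffer and advances line/column accordingly. The concrete slices here
// are illustrative only.
unittest {
	uint line = 0;
	uint column = 0;
	string code = "ab\ncd";
	// Pretend the lexer consumed "ab\nc", leaving "d".
	updateLineAndCol(line, column, code, code[4 .. $]);
	assert(line == 1);
	assert(column == 1);
}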

void serializeWhiteTokens(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		size_t whiteNum = 0;
		foreach (ch; con) {
			if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {
				whiteNum++;
			} else {
				break;
			}
		}
		if (whiteNum > 0) {
			token.str = con[0 .. whiteNum];
			con = con[whiteNum .. $];
			token.type = StandardTokens.white;
			return;
		}
		token.type = StandardTokens.notoken;
	} else {
		if (token.type == StandardTokens.white) {
			con ~= cast(char[]) token.str;
		}
	}
}
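
// Usage sketch for serializeWhiteTokens in load mode; the input string is
// illustrative only.
unittest {
	string str = "  \t\nabc";
	TokenData tk;
	serializeWhiteTokens!(true)(tk, str);
	assert(tk.type == StandardTokens.white);
	assert(tk.str == "  \t\n");
	assert(str == "abc");
}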

void serializeCommentMultiline(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		char[2] slst = ['/', '*'];
		assert(con[0 .. 2] == slst[]);
		con = con[2 .. $];
		foreach (i, ch; con) {
			if (ch == '*' && i != con.length - 1 && con[i + 1] == '/') {
				token.str = con[0 .. i];
				con = con[i + 2 .. $];
				token.type = StandardTokens.comment_multiline;
				return;
			}
		}
		token.str = con;
		con = null;
		token.type = StandardTokens.comment_multiline;
	} else {
		if (token.type == StandardTokens.comment_multiline) {
			con ~= cast(char[]) "/*";
			con ~= cast(char[]) token.str;
			con ~= cast(char[]) "*/";
		}
	}
}

unittest {
	string str = "/*  aaa bbb ccc */";
	TokenData tk;
	serializeCommentMultiline!(true)(tk, str);
	assert(tk.str == "  aaa bbb ccc ");
}

void serializeCommentLine(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		char[2] slsl = ['/', '/'];
		assert(con[0 .. 2] == slsl[]);
		con = con[2 .. $];
		foreach (i, ch; con) {
			if (ch == '\n') {
				// Take everything up to the newline, dropping a trailing '\r' if present.
				size_t end = i;
				if (end > 0 && con[end - 1] == '\r') {
					end--;
				}
				token.str = con[0 .. end];
				con = con[i .. $];
				token.type = StandardTokens.comment_line;
				return;
			}
		}
		token.str = con;
		con = null;
		token.type = StandardTokens.comment_line;
	} else {
		if (token.type == StandardTokens.comment_line) {
			con ~= cast(char[]) "//";
			con ~= cast(char[]) token.str;
		}
	}
}
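
// Usage sketch for serializeCommentLine in load mode: the comment text stops
// before the newline, which stays in the remaining input.
unittest {
	string str = "// aaa\nbbb";
	TokenData tk;
	serializeCommentLine!(true)(tk, str);
	assert(tk.type == StandardTokens.comment_line);
	assert(tk.str == " aaa");
	assert(str == "\nbbb");
}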

bool isIdentifierFirstChar(char ch) {
	return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
}

void serializeIdentifier(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		size_t charactersNum = 0;
		if (isIdentifierFirstChar(con[0])) {
			charactersNum++;
		} else {
			token.type = StandardTokens.notoken;
			return;
		}
		foreach (ch; con[1 .. $]) {
			if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
					|| (ch >= '0' && ch <= '9') || ch == '_') {
				charactersNum++;
			} else {
				break;
			}
		}
		token.str = con[0 .. charactersNum];
		con = con[charactersNum .. $];
		token.type = StandardTokens.identifier;
	} else {
		if (token.type == StandardTokens.identifier) {
			con ~= cast(char[]) token.str;
		}
	}
}
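
// Usage sketch for serializeIdentifier in load mode; identifiers start with a
// letter or '_' and may continue with letters, digits, or '_'.
unittest {
	string str = "foo_1 bar";
	TokenData tk;
	serializeIdentifier!(true)(tk, str);
	assert(tk.type == StandardTokens.identifier);
	assert(tk.str == "foo_1");
	assert(str == " bar");
}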

void serializeStringToken(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		token.type = StandardTokens.string_;
		assert(con[0] == '"');
		if (con.length < 2) {
			token.str = null;
			con = null;
			return;
		}
		if (con[0 .. 2] == `""`) {
			token.str = null;
			con = con[2 .. $];
			return;
		}
		con = con[1 .. $];
		size_t end = 0;
		bool ignoreNext = false;
		// Scan from the first character after the opening quote; a backslash
		// escapes the following character.
		foreach (i; 0 .. con.length) {
			if (ignoreNext) {
				ignoreNext = false;
				continue;
			}
			if (con[i] == '\\') {
				ignoreNext = true;
				continue;
			}
			if (con[i] == '"') {
				end = i;
				break;
			}
		}
		token.str = con[0 .. end];
		con = con[end + 1 .. $];
	} else {
		if (token.type == StandardTokens.string_) {
			con ~= cast(char[]) token.str;
		}
	}
}
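
// Usage sketch for serializeStringToken in load mode: token.str keeps the raw
// escaped contents between the quotes; getUnescapedString resolves the escapes.
unittest {
	string str = `"aa\"bb" rest`;
	TokenData tk;
	serializeStringToken!(true)(tk, str);
	assert(tk.type == StandardTokens.string_);
	assert(tk.str == `aa\"bb`);
	assert(tk.getUnescapedString() == `aa"bb`);
	assert(str == " rest");
}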

/// The returned string is valid only until the next call to any mutils.conv function.
string doubleToString(double num) {
	return num.to!string;
}

/// The returned string is valid only until the next call to any mutils.conv function.
string longToString(long num) {
	return num.to!string;
}

long stringToLong(string str) {

	enum long[19] table = [
			1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000,
			1000000000000, 10000000000000, 100000000000000, 1000000000000000,
			10000000000000000, 100000000000000000, 1000000000000000000,
		];
	assert(str.length < 20);
	if (str.length == 0) {
		return 0;
	}
	bool minus;
	if (str.ptr[0] == '-') {
		minus = true;
		str = str[1 .. $];
	}
	long num;
	int i;
	foreach_reverse (c; str) {
		num += (c - '0') * table[i];
		i++;
	}

	if (minus) {
		num *= -1;
	}
	return num;
}
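
// Sketch of stringToLong behavior on a few simple inputs.
unittest {
	assert(stringToLong("123") == 123);
	assert(stringToLong("-42") == -42);
	assert(stringToLong("") == 0);
}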

void serializeNumberToken(bool load, Container)(ref TokenData token, ref Container con) {
	static if (load == true) {
		bool minus = false;
		string firstPart;
		string secondPart;
		if (con[0] == '-') {
			minus = true;
			con = con[1 .. $];
		}
		foreach (i, ch; con) {
			if (ch >= '0' && ch <= '9') {
				firstPart = con[0 .. i + 1];
			} else {
				break;
			}
		}
		con = con[firstPart.length .. $];
		if (con.length > 0 && con[0] == '.') {
			con = con[1 .. $];
			foreach (i, ch; con) {
				if (ch >= '0' && ch <= '9') {
					secondPart = con[0 .. i + 1];
				} else {
					break;
				}
			}
			con = con[secondPart.length .. $];
			// Skip an optional trailing 'f' suffix, guarding against end of input.
			if (con.length > 0 && con[0] == 'f') {
				con = con[1 .. $];
			}
			double num = stringToLong(firstPart) + cast(double) stringToLong(
					secondPart) / (10 ^^ secondPart.length);
			token.double_ = minus ? -num : num;
			token.type = StandardTokens.double_;
		} else {
			long num = stringToLong(firstPart);
			token.long_ = minus ? -num : num;
			token.type = StandardTokens.long_;
		}
	} else {
		if (token.type == StandardTokens.double_) {
			con ~= cast(char[]) doubleToString(token.double_);
		} else if (token.type == StandardTokens.long_) {
			con ~= cast(char[]) longToString(token.long_);
		} else {
			assert(0);
		}
	}
}
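
// Usage sketch for serializeNumberToken in load mode; numbers with a fractional
// part become double_ tokens, plain integers become long_ tokens.
unittest {
	string str = "-12.5 rest";
	TokenData tk;
	serializeNumberToken!(true)(tk, str);
	assert(tk.type == StandardTokens.double_);
	assert(tk.double_ == -12.5);
	assert(str == " rest");

	str = "42;";
	serializeNumberToken!(true)(tk, str);
	assert(tk.type == StandardTokens.long_);
	assert(tk.long_ == 42);
	assert(str == ";");
}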

alias whiteTokens = AliasSeq!('\n', '\t', '\r', ' ');

enum StandardTokens {
	notoken = 0,
	white = 1,
	character = 2,
	identifier = 3,
	string_ = 4,
	double_ = 5,
	long_ = 6,
	comment_multiline = 7,
	comment_line = 8,
}

struct TokenData {
	union {
		string str;
		char ch;
		long long_;
		double double_;
	}

	uint line;
	uint column;
	uint type = StandardTokens.notoken;

	char getChar() {
		assert(type == StandardTokens.character);
		return ch;
	}

	string getUnescapedString() {
		assert(type == StandardTokens.string_);

		bool hasEscapeChar = false;
		foreach (ch; str) {
			if (ch == '\\') {
				hasEscapeChar = true;
				break;
			}
		}

		if (!hasEscapeChar) {
			return str;
		}
		string copy;
		copy.reserve(str.length);
		bool ignoreNext = false;
		foreach (i, ch; str) {
			if (ignoreNext) {
				ignoreNext = false;
				continue;
			}
			if (ch == '\\' && i != str.length - 1) {
				char nextCh = str[i + 1];
				ignoreNext = true;
				if (nextCh == 'n') {
					copy ~= '\n';
				} else if (nextCh == 't') {
					copy ~= '\t';
				} else if (nextCh == '\\') {
					copy ~= '\\';
				} else if (nextCh == '/') {
					copy ~= '/';
				} else if (nextCh == '"') {
					copy ~= '"';
				}
			}
			if (ch == '\\') {
				continue;
			}
			copy ~= ch;
		}
		return copy;
	}

	string getEscapedString() {
		return str;
	}

	bool isChar(char ch) {
		return type == StandardTokens.character && this.ch == ch;
	}

	bool isString(string ss) {
		return (type == StandardTokens.comment_line || type == StandardTokens.comment_multiline
				|| type == StandardTokens.identifier
				|| type == StandardTokens.string_ || type == StandardTokens.white) && str == ss;
	}

	bool isComment() {
		return type == StandardTokens.comment_line || type == StandardTokens.comment_multiline;
	}

	void opAssign(T)(T el)
			if (isIntegral!T || isFloatingPoint!T || is(T == string)
				|| is(Unqual!T == char) || is(T == bool)) {
		alias TP = Unqual!T;
		static if (isIntegral!TP || is(T == bool)) {
			type = StandardTokens.long_;
			this.long_ = el;
		} else static if (isFloatingPoint!TP) {
			type = StandardTokens.double_;
			this.double_ = el;
		} else static if (is(TP == string)) {
			type = StandardTokens.string_;
			this.str = el;
		} else static if (is(TP == char)) {
			type = StandardTokens.character;
			this.ch = el;
		} else {
			static assert(0);
		}
	}

	bool isAssignableTo(T)()
			if (isIntegral!T || isFloatingPoint!T || is(T == string) || is(T == char)
				|| is(T == bool)) {
		static if (isIntegral!T || is(T == bool)) {
			return type == StandardTokens.long_;
		} else static if (isFloatingPoint!T) {
			return type == StandardTokens.double_ || type == StandardTokens.long_;
		} else static if (is(T == string)) {
			return type == StandardTokens.string_;
		} else static if (is(T == char)) {
			return type == StandardTokens.character;
		} else {
			static assert(0);
		}
	}

	bool isType(T)()
			if (isIntegral!T || isFloatingPoint!T || is(T == string) || is(T == char)
				|| is(T == bool)) {
		static if (isIntegral!T || is(T == bool)) {
			return type == StandardTokens.long_;
		} else static if (isFloatingPoint!T) {
			return type == StandardTokens.double_;
		} else static if (is(T == string)) {
			return type == StandardTokens.string_;
		} else static if (is(T == char)) {
			return type == StandardTokens.character;
		} else {
			static assert(0);
		}
	}

	auto get(T)()
			if (isIntegral!T || isFloatingPoint!T || is(T == string) || is(T == char)
				|| is(T == bool)) {
		static if (isIntegral!T || is(T == bool)) {
			assert(type == StandardTokens.long_);
			return cast(T) long_;
		} else static if (isFloatingPoint!T) {
			assert(type == StandardTokens.double_ || type == StandardTokens.long_);
			if (type == StandardTokens.double_) {
				return cast(T) double_;
			} else if (type == StandardTokens.long_) {
				return cast(T) long_;
			}
			return T.init; // For release build stability
		} else static if (is(T == string)) {
			assert(type == StandardTokens.string_);
			return cast(T) str;
		} else static if (is(T == char)) {
			assert(type == StandardTokens.character);
			return cast(T) ch;
		} else {
			static assert(0);
		}
	}

	string toString() {
		import std.format;

		switch (type) {
		case StandardTokens.character:
			return format("TK(%5s, '%s', %s, %s)",
					cast(StandardTokens) type, ch, line, column);
		case StandardTokens.string_:
		case StandardTokens.identifier:
		case StandardTokens.white:
		case StandardTokens.comment_line:
		case StandardTokens.comment_multiline:
			return format("TK(%5s, \"%s\", %s, %s)",
					cast(StandardTokens) type, str, line, column);
		case StandardTokens.double_:
			return format("TK(%5s, %s, %s, %s)",
					cast(StandardTokens) type, double_, line, column);
		case StandardTokens.long_:
			return format("TK(%5s, %s, %s, %s)",
					cast(StandardTokens) type, long_, line, column);
		default:
			return format("TK(%5s, ???, %s, %s)", cast(StandardTokens) type, line, column);
		}
	}
}

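// Usage sketch for TokenData assignment, type queries, and string unescaping;
// the values here are illustrative only.
unittest {
	TokenData tk;
	tk = 7;
	assert(tk.isType!int && tk.get!int == 7);
	assert(tk.isAssignableTo!double && tk.get!double == 7);
	tk = 2.5;
	assert(tk.isType!double && tk.get!double == 2.5);
	tk = 'x';
	assert(tk.isChar('x') && tk.get!char == 'x');
	tk = `a\nb`;
	assert(tk.isString(`a\nb`));
	assert(tk.getUnescapedString() == "a\nb");
}
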
alias TokenDataVector = Vector!(TokenData);

void printAllTokens(Lexer)(ref Lexer lex) {
	import std.stdio : writeln;

	TokenData token;
	do {
		token = lex.getNextToken();
		writeln(token);
	}
	while (token.type != StandardTokens.notoken);
}

TokenDataVector tokenizeAll(Lexer)(ref Lexer lex) {
	TokenDataVector tokens;
	do {
		tokens ~= lex.getNextToken();
	}
	while (tokens[$ - 1].type != StandardTokens.notoken);

	return tokens;
}

Vector!char tokensToString(Lexer)(ref Lexer lex, TokenData[] tokens) {
	Vector!char code;
	foreach (tk; tokens)
		lex.toChars(tk, code);
	return code;
}
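
// Usage sketch for tokenizeAll with a hypothetical minimal lexer (illustration
// only); any type providing getNextToken() that eventually returns a notoken
// token works the same way.
unittest {
	static struct DummyLexer {
		TokenData[] tokens;
		size_t pos;

		TokenData getNextToken() {
			if (pos >= tokens.length) {
				TokenData end; // default type is StandardTokens.notoken
				return end;
			}
			return tokens[pos++];
		}
	}

	TokenData tk;
	tk = 5;
	DummyLexer lex = DummyLexer([tk]);
	TokenDataVector all = tokenizeAll(lex);
	assert(all[0].get!long == 5);
	assert(all[$ - 1].type == StandardTokens.notoken);
}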