Creating my own Compiler - Part 1
Some random weekend research led me to look into how a compiler works and the typical approaches to building one. I am interested in creating a compiler for data models in a protobuf-like way, but not for messages.
Desired Model File Example
The goal is to largely use C-style syntax, but provide a means to specify additional attributes. I originally thought about JSON, but was inspired by some of the protobuf alternatives.
enum Enum1 {
Unknown = 0 [display = "Unknown"];
Uninstalled = 1;
Installed = 2;
}
struct Type0 {
bool active [display = "Active"];
}
struct Type1 {
int32_t var1 [display = "Var 1", units = "Hz"];
Type0 var2 [display = "Var 2", group = "Group 1"];
bool[32] array1;
int32_t bitmask [monitor = false];
}
Prototype 1 Lexer
I was looking at some open-source compilers / lexers and some Python modules, but decided to do it from scratch with minimal dependencies. I also opted for a state machine-based approach rather than leaning too heavily on regex.
There is likely some optimization and general cleanup that can be done, but it seems to be working as expected. In the end, it allows iteration through the available tokens.
import sys
import enum
import re
class Lexer:
    """Hand-written, state-machine lexer for the data-model definition files.

    Typical use: call ``initialize(filename)`` once, then call ``get_token()``
    repeatedly until it returns ``None``.

    Recognised input:
      * keywords listed in ``self.keywords``
      * single-character operators listed in ``self.operators``
      * double-quoted strings (emitted as ``CONST_STRING`` without the quotes)
      * decimal, hex (``0x..``) and binary (``0b..``) numbers
      * ``//`` and ``#`` line comments, ``/* ... */`` block comments
      * anything else delimited by whitespace/operators is a ``VALUE``
    """

    STATE_UNKNOWN = 0
    STATE_NORMAL = 1
    STATE_QUOTED = 2
    STATE_ELEMENT = 3
    STATE_LINECOMMENT = 4
    STATE_BLOCKCOMMENT = 5

    def __init__(self):
        """Create a lexer with no input loaded (state is ``STATE_UNKNOWN``)."""
        self.data = None
        self.index = 0
        self.total_length = 0
        self.keywords = [
            'enum',
            'struct',
        ]
        # Characters that always terminate the current element and form a
        # token on their own.
        self.special_characters = [
            '=', '[', ']', '{', '}', ';', ',',
        ]
        self.operators = {
            '=': TokenType.OP_ASSIGN,
            '[': TokenType.OP_LEFT_BRACKET,
            ']': TokenType.OP_RIGHT_BRACKET,
            ';': TokenType.OP_END_OF_STATEMENT,
            '{': TokenType.OP_LEFT_BRACE,
            '}': TokenType.OP_RIGHT_BRACE,
            ',': TokenType.OP_COMMA,
        }
        self.state = self.STATE_UNKNOWN
        self.hex_regex = re.compile(r'^0[xX][0-9a-fA-F]+$')
        self.binary_regex = re.compile(r'^0[bB][0-1]+$')
        self.number_regex = re.compile(r'^\d+\.?\d*$')

    def initialize(self, filename):
        """Load the whole of *filename* and reset the lexer to its start.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        self.data = None
        self.state = self.STATE_UNKNOWN
        self.index = 0
        self.total_length = 0
        with open(filename) as f:
            self.data = f.read()
        self.total_length = len(self.data)
        # Only switch to NORMAL once the data has actually been loaded.
        self.state = self.STATE_NORMAL

    def get_next_character(self):
        """Return the next character and advance, or ``None`` at end of input."""
        if self.index >= self.total_length:
            return None
        result = self.data[self.index]
        self.index += 1
        return result

    def push_back_character(self):
        """Step the read position back by one character (no-op at the start)."""
        if self.index > 0:
            self.index -= 1

    def peek_next_character(self):
        """Return the next character without consuming it, or ``None`` at end."""
        if self.index < self.total_length:
            return self.data[self.index]
        return None

    def get_token(self):
        """Return the next ``Token``, or ``None`` when the input is exhausted.

        Raises:
            Exception: on an unterminated string literal, or when called while
                the lexer is in ``STATE_UNKNOWN`` (i.e. before ``initialize``).
        """
        value = ''
        while True:
            c = self.get_next_character()

            if self.state == self.STATE_QUOTED:
                if c is None:
                    raise Exception('syntax error: string missing end quote')
                if c == '"':
                    # Closing quote: emit the text between the quotes.
                    self.state = self.STATE_NORMAL
                    return Token(value, TokenType.CONST_STRING)
                value += c

            elif self.state == self.STATE_NORMAL:
                if c is None:
                    # no further characters... we are done
                    return None
                if c.isspace():
                    continue
                next_char = self.peek_next_character()
                if c == '"':
                    # Opening quote: without this transition the QUOTED state
                    # is unreachable and strings get split on whitespace.
                    self.state = self.STATE_QUOTED
                    value = ''
                elif c == '/' and next_char == '/':
                    self.state = self.STATE_LINECOMMENT
                elif c == '/' and next_char == '*':
                    # Consume the '*' so that "/*/" is not mistaken for a
                    # complete (empty) block comment.
                    self.get_next_character()
                    self.state = self.STATE_BLOCKCOMMENT
                elif c == '#':
                    self.state = self.STATE_LINECOMMENT
                elif c in self.special_characters:
                    return self.evaluate_operator_value(c)
                else:
                    value = c
                    self.state = self.STATE_ELEMENT

            elif self.state == self.STATE_ELEMENT:
                if c is None or c.isspace():
                    # End of input or whitespace ends the element. Resetting
                    # the state here matters at end of input: otherwise the
                    # following call would try to evaluate an empty element.
                    self.state = self.STATE_NORMAL
                    return self.evaluate_token_value(value) if value else None
                if c in self.special_characters:
                    # The operator belongs to the next token; un-read it.
                    self.push_back_character()
                    self.state = self.STATE_NORMAL
                    return self.evaluate_token_value(value) if value else None
                value += c

            elif self.state == self.STATE_LINECOMMENT:
                if c is None:
                    return None
                if c == '\n':
                    self.state = self.STATE_NORMAL

            elif self.state == self.STATE_BLOCKCOMMENT:
                if c is None:
                    return None
                if c == '*' and self.peek_next_character() == '/':
                    self.get_next_character()
                    self.state = self.STATE_NORMAL

            else:
                raise Exception('unknown state')

    def evaluate_token_value(self, value):
        """Classify the text of an element and wrap it in a ``Token``.

        Raises:
            Exception: if *value* is empty.
        """
        if not value:
            raise Exception('invalid token value')
        if value in self.keywords:
            return Token(value, TokenType.KEYWORD)
        if value in self.operators:
            return Token(value, self.operators[value])
        if self.is_numeric(value):
            return Token(value, TokenType.CONST_NUMERIC)
        return Token(value, TokenType.VALUE)

    def evaluate_operator_value(self, value):
        """Wrap a single operator character in a ``Token``.

        Raises:
            Exception: if *value* is empty or not a key of ``self.operators``.
        """
        if not value:
            raise Exception('invalid token')
        if value not in self.operators:
            raise Exception('invalid operator')
        return Token(value, self.operators[value])

    def is_numeric(self, value):
        """Return True if *value* is a decimal, hex (0x) or binary (0b) literal."""
        patterns = (self.number_regex, self.hex_regex, self.binary_regex)
        return any(pattern.match(value) for pattern in patterns)
class Token:
    """One lexical token: its text together with its classification."""

    def __init__(self, token_text, token_type):
        # text: the token's actual text (used for identifiers, strings, numbers).
        # kind: the TokenType that this token is classified as.
        self.text, self.kind = token_text, token_type
# TokenType is our enum for all the types of tokens.
class TokenType(enum.Enum):
    """Enumeration of all token classifications; every value is a plain int."""

    # General token categories.
    EOF = -1
    KEYWORD = 0
    CONST_STRING = 1
    CONST_NUMERIC = 2
    VALUE = 3

    # Operators / punctuation (100-series).
    OP_ASSIGN = 100
    OP_LEFT_BRACKET = 101
    OP_RIGHT_BRACKET = 102
    OP_END_OF_STATEMENT = 103
    # No trailing comma here: "104," would make the value the tuple (104,).
    OP_LEFT_BRACE = 104
    OP_RIGHT_BRACE = 105
    OP_COMMA = 106
Test Application
from generator import Lexer
def cli_main(filename="D:\\Projects\\lexer\\test1.dmod"):
    """Lex *filename* and print every token, one per line.

    Args:
        filename: Path of the model file to tokenize. Defaults to the sample
            path that used to be hard-coded, so calling ``cli_main()`` with
            no arguments behaves as before.
    """
    lexer = Lexer()
    lexer.initialize(filename)
    # get_token() returns None once the input is exhausted, which ends the loop.
    for token in iter(lexer.get_token, None):
        print(f'text = "{token.text}", kind = "{token.kind}"')


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default path.
    cli_main(*sys.argv[1:2])
Text Output
text = "enum", kind = "TokenType.KEYWORD"
text = "Enum1", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "Unknown", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "0", kind = "TokenType.CONST_NUMERIC"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Unknown"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Uninstalled", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "1", kind = "TokenType.CONST_NUMERIC"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Installed", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "2", kind = "TokenType.CONST_NUMERIC"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"
text = "struct", kind = "TokenType.KEYWORD"
text = "Type0", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "bool", kind = "TokenType.VALUE"
text = "active", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Active"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"
text = "struct", kind = "TokenType.KEYWORD"
text = "Type1", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "int32_t", kind = "TokenType.VALUE"
text = "var1", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Var", kind = "TokenType.VALUE"
text = "1"", kind = "TokenType.VALUE"
text = ",", kind = "TokenType.OP_COMMA"
text = "units", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Hz"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Type0", kind = "TokenType.VALUE"
text = "var2", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Var", kind = "TokenType.VALUE"
text = "2"", kind = "TokenType.VALUE"
text = ",", kind = "TokenType.OP_COMMA"
text = "group", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Group", kind = "TokenType.VALUE"
text = "1"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "bool", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "32", kind = "TokenType.CONST_NUMERIC"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = "array1", kind = "TokenType.VALUE"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "int32_t", kind = "TokenType.VALUE"
text = "bitmask", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "monitor", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "false", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"
Process finished with exit code 0
All the code may be found here: https://github.com/ericjameszimmerman/simplecompiler