Creating my own Compiler - Part 2
Extending what was completed in Part 1, we move on from the lexer to the parser.
Main Test Function
The lexer was refactored to move the file I/O out of it, and the parser now outputs a collection of classes representing the parsed structure.
from generator import Lexer
from generator import Parser
from generator import SyntaxTree
def cli_main(filename="D:\\Projects\\lexer\\test1.dmod"):
    """Lex and parse a model file, then print the resulting syntax tree.

    Args:
        filename: Path of the model file to read. Defaults to the test
            file that was previously hard-coded in the function body.
    """
    with open(filename) as f:
        source_text = f.read()

    # Keep the raw text and the tree under separate names; the original
    # reused a single ``data`` variable for both.
    lexer = Lexer(source_text)
    tree = SyntaxTree()
    parser = Parser(lexer, tree)
    parser.parse()
    print(tree)


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default path;
    # with no arguments the behaviour matches the original script.
    cli_main(*sys.argv[1:2])
Model File
Similar to Part 1, this is the file being compiled into another format.
enum Enum1 {
Unknown = 0 [display = "Unknown"],
Uninstalled = 1,
Installed = 2,
}
struct Type0 {
bool active [display = "Active"];
}
struct Type1 {
int32_t var1 [display = "Var 1", units = "Hz"];
Type0 var2 [display = "Var 2", group = "Group 1"];
bool[32] array1;
int32_t bitmask [monitor = false];
}
Parser Core Operation
The parser owns the list of keywords and is organized to call a specific handler based on the keyword at the top level.
import re
from .lexer import Lexer
from .lexer import TokenType
from .syntax import *
class Parser:
    """Top-level parser that turns lexer tokens into syntax-tree items.

    The parser owns the keyword list (handed to the lexer at the start of
    ``parse``) and dispatches each top-level keyword token to a dedicated
    sub-parser method.
    """

    # Patterns for classifying literal text. They are compiled once on the
    # class and remain readable through ``self.<name>`` as before.
    hex_regex = re.compile(r'^0[xX][0-9a-fA-F]+$')
    binary_regex = re.compile(r'^0[bB][0-1]+$')
    number_regex = re.compile(r'^\d+\.?\d*$')
    # Requires at least one digit; the previous '^\d*$' also matched the
    # empty string.
    integer_regex = re.compile(r'^\d+$')

    def __init__(self, lexer, ast_data):
        """Store the token source and the container that receives parsed items.

        Args:
            lexer: Object providing ``get_token()`` and ``peek_token()`` and
                accepting a ``keywords`` attribute.
            ast_data: Container whose ``add(item)`` is called for every
                parsed top-level item.
        """
        self.lexer = lexer
        self.ast_data = ast_data
        self.token = None
        self.keywords = [
            'enum',
            'struct',
        ]
        # Keyword text -> bound method that parses that construct.
        self.subparsers = {
            'enum': self.parse_enum,
            'struct': self.parse_struct,
        }

    def parse(self):
        """Consume tokens until the lexer returns a falsy token.

        Each keyword token is routed to its sub-parser and any truthy
        result is added to ``ast_data``.

        Raises:
            Exception: If a keyword token has no registered sub-parser.
        """
        self.lexer.keywords = self.keywords
        while True:
            token = self.lexer.get_token()
            if not token:
                break
            if token.kind != TokenType.KEYWORD:
                # NOTE(review): non-keyword tokens at the top level are
                # skipped silently, exactly as before - confirm whether
                # they should be reported as a syntax error instead.
                continue
            handler = self.subparsers.get(token.text)
            if handler is None:
                raise Exception('syntax error')
            item = handler()
            if item:
                self.ast_data.add(item)

    def next_token(self):
        """Advance the lexer and remember the consumed token in ``self.token``."""
        self.token = self.lexer.get_token()

    def peek_next_token(self):
        """Return the lexer's ``peek_token()`` result."""
        return self.lexer.peek_token()
Parsing a Struct
The following method handles parsing a top-level structure, where a struct has a unique name and contents within the braces make up the structure.
def parse_struct(self):
    """Parse a struct definition that follows the ``struct`` keyword.

    Reads the struct name and the opening brace, then collects struct items
    until a closing brace is the next token, the lexer has nothing left to
    peek, or an item parse yields a falsy result. The closing brace is then
    required via ``expect_token``.

    Returns:
        The populated ``StructDef``.
    """
    struct_def = StructDef()
    struct_def.name = self.parse_name_token()
    self.expect_token(TokenType.OP_LEFT_BRACE)

    upcoming = self.peek_next_token()
    while upcoming and upcoming.kind != TokenType.OP_RIGHT_BRACE:
        member = self.parse_struct_item()
        if not member:
            break
        struct_def.add(member)
        upcoming = self.peek_next_token()

    self.expect_token(TokenType.OP_RIGHT_BRACE)
    return struct_def
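The following method parses a single structure item by retrieving tokens and checking them against the allowed syntax. Specifically, the format is a data type (either a plain type or an array, written as the type followed by a dimension in brackets), then a name, then an optional bracketed list of attributes, terminated by an end-of-statement token.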
def parse_struct_item(self):
    """Parse one struct member: data type, optional array size, name, properties.

    The accepted token sequence is a data-type name, an optional
    ``[`` integer ``]`` array dimension, the member name, zero or more
    bracketed property lists, and an end-of-statement token.

    Returns:
        The populated ``StructItemDef``.

    Raises:
        Exception: If the token stream ends before the item is complete, or
            if a token other than a property list or end-of-statement
            follows the member name.
    """
    item = StructItemDef()
    # Data type first
    item.data_type = self.parse_name_token()

    # Check for optional array specifier
    next_token = self.peek_next_token()
    if not next_token:
        # Previously this case failed with an AttributeError on None; report
        # it the same way the property loop below reports a missing token.
        raise Exception('syntax error: missing token')
    if next_token.kind == TokenType.OP_LEFT_BRACKET:
        self.next_token()
        # this is an array
        item.dim = self.parse_integer_token()
        self.expect_token(TokenType.OP_RIGHT_BRACKET)

    # item identifier
    item.name = self.parse_name_token()

    # check for optional properties and end of statement
    while True:
        next_token = self.peek_next_token()
        if not next_token:
            raise Exception('syntax error: missing token')
        if next_token.kind == TokenType.OP_LEFT_BRACKET:
            # optional bracketed property list; consume the '[' first
            self.next_token()
            item.properties = self.parse_property_list()
        elif next_token.kind == TokenType.OP_END_OF_STATEMENT:
            self.next_token()
            break
        else:
            raise Exception('unexpected token')
    return item
Testing
At this point, I simply set a break-point in PyCharm and evaluated the generated data structures.
You may find the current source here: https://github.com/ericjameszimmerman/simplecompiler