Creating my own Compiler - Part 2

Extending what was completed in Part 1, we move on from the lexer to the parser.

Main Test Function

The lexer was refactored to move the file I/O out of it, and the output of the parser is a collection of class instances representing the structure of the parsed file.

from generator import Lexer
from generator import Parser
from generator import SyntaxTree

def cli_main(filename="D:\\Projects\\lexer\\test1.dmod"):
    """Parse a model file and print the resulting syntax tree.

    Args:
        filename: Path of the model file to parse. Defaults to the sample
            file path used in this walkthrough.
    """
    with open(filename) as f:
        source = f.read()

    # Keep the source text and the tree under separate names. The tree is
    # handed to the parser, which is expected to populate it during parse().
    tree = SyntaxTree()
    parser = Parser(Lexer(source), tree)
    parser.parse()
    print(tree)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    cli_main()

Model File

As in Part 1, this is the file being compiled into another format.

enum Enum1 {
  Unknown = 0 [display = "Unknown"],
  Uninstalled = 1,
  Installed = 2,
}

struct Type0 {
  bool active [display = "Active"];
}

struct Type1 {
  int32_t var1 [display = "Var 1", units = "Hz"];
  Type0 var2 [display = "Var 2", group = "Group 1"];
  bool[32] array1;
  int32_t bitmask [monitor = false];
}

Parser Core Operation

The parser owns the list of keywords and is organized to call a specific handler based on the keyword at the top level.

import re
from .lexer import Lexer
from .lexer import TokenType
from .syntax import *

class Parser:
    def __init__(self, lexer, ast_data):
        self.lexer = lexer
        self.ast_data = ast_data
        self.token = None
        self.keywords = [
            'enum',
            'struct'
        ]

        self.subparsers = {
            'enum': self.parse_enum,
            'struct': self.parse_struct,
        }

        self.hex_regex = re.compile('^0[xX][0-9a-fA-F]+$')
        self.binary_regex = re.compile('^0[bB][0-1]+$')
        self.number_regex = re.compile('^\\d+\\.?\\d*$')
        self.integer_regex = re.compile('^\\d*$')

    def parse(self):
        self.lexer.keywords = self.keywords

        while True:
            token = self.lexer.get_token()
            if token:
                if token.kind == TokenType.KEYWORD:
                    if token.text in self.subparsers:
                        item = self.subparsers[token.text]()
                        if item:
                            self.ast_data.add(item)
                else:
                    raise Exception('syntax error')
            else:
                break

    def next_token(self):
        self.token = self.lexer.get_token()

    def peek_next_token(self):
        return self.lexer.peek_token()

Parsing a Struct

The following method handles parsing a top-level structure, where a struct has a unique name and the contents within the braces make up the structure.

    def parse_struct(self):
        item = StructDef()
        item.name = self.parse_name_token()
        self.expect_token(TokenType.OP_LEFT_BRACE)

        while True:
            next_token = self.peek_next_token()
            if next_token:
                if next_token.kind == TokenType.OP_RIGHT_BRACE:
                    break
            else:
                break

            child = self.parse_struct_item()
            if child:
                item.add(child)
            else:
                break

        self.expect_token(TokenType.OP_RIGHT_BRACE)
        return item

The following method handles parsing a single structure item by retrieving tokens and checking them against the allowed syntax. Specifically, the format is a data type, a name, and optional attributes, where the data type may be a plain type or an array of a type.

    def parse_struct_item(self):
        item = StructItemDef()

        # Data type first
        item.data_type = self.parse_name_token()

        # Check for optional array specifier

        next_token = self.peek_next_token()
        if next_token.kind == TokenType.OP_LEFT_BRACKET:
            self.next_token()
            # this is an array
            item.dim = self.parse_integer_token()
            self.expect_token(TokenType.OP_RIGHT_BRACKET)

        # item identifier
        item.name = self.parse_name_token()

        # check for optional properties and end of statement
        while True:
            next_token = self.peek_next_token()

            if next_token:
                # optional assignment to a value
                if next_token.kind == TokenType.OP_LEFT_BRACKET:
                    self.next_token()
                    item.properties = self.parse_property_list()
                elif next_token.kind == TokenType.OP_END_OF_STATEMENT:
                    self.next_token()
                    break
                else:
                    raise Exception('unexpected token')
            else:
                raise Exception('syntax error: missing token')

        return item

Testing

At this point, I simply set a breakpoint in PyCharm and inspected the generated data structures.

You may find the current source here: https://github.com/ericjameszimmerman/simplecompiler