Let's write a compiler
~25 mins read
Both an interpreter and a compiler translate one language into another, normally a higher-level one into a lower-level one.
In the end, every program is translated down to assembly, then CPU instructions, and finally electrical pulses through the transistors of logic gates.
In the An interpreter in Python post, we built a mini interpreter that walks the AST and executes each node directly. The flow looked like
Source Code -> TOKENIZER -> tokens -> PARSER -> Abstract Syntax Tree -> EVALUATOR -> executed by the host language (e.g. C, for CPython) -> COMPILER -> machine code
For a compiler, the flow looks like
Source Code -> TOKENIZER -> tokens -> PARSER -> Abstract Syntax Tree -> COMPILER -> bytecode -> VM -> compiler for the language the VM is written in -> assembler -> machine code
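To make those stages concrete, here is a rough sketch of what a tiny expression might look like after each stage. The data shapes below are purely illustrative and are not the classes we build later in this post.

# Illustrative only: hypothetical shapes of the data between stages.
source = "1 + 2 * 3;"

# After the TOKENIZER: a flat list of (type, literal) pairs.
tokens = [("INT", "1"), ("PLUS", "+"), ("INT", "2"),
          ("ASTERISK", "*"), ("INT", "3"), ("SEMICOLON", ";")]

# After the PARSER: a tree that encodes precedence (2 * 3 binds tighter).
tree = ("+", 1, ("*", 2, 3))

# After the COMPILER: flat bytecode for a small stack-based VM.
bytecode = [
    ("PUSH", 1),
    ("PUSH", 2),
    ("PUSH", 3),
    ("MUL",),   # pops 2 and 3, pushes 6
    ("ADD",),   # pops 1 and 6, pushes 7
]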
from enum import Enum
from dataclasses import dataclass


# Enum to represent different token types
class TokenType(Enum):
    ILLEGAL = "ILLEGAL"
    EOF = "EOF"

    # Identifiers + literals
    IDENT = "IDENT"    # add, foobar, x, y, ...
    INT = "INT"        # 1343456
    STRING = "STRING"  # "foobar"

    # Operators
    ASSIGN = "="
    PLUS = "+"
    MINUS = "-"
    BANG = "!"
    ASTERISK = "*"
    SLASH = "/"
    LT = "<"
    GT = ">"
    EQ = "=="
    NOT_EQ = "!="

    # Delimiters
    COMMA = ","
    SEMICOLON = ";"
    COLON = ":"
    LPAREN = "("
    RPAREN = ")"
    LBRACE = "{"
    RBRACE = "}"
    LBRACKET = "["
    RBRACKET = "]"

    # Keywords
    FUNCTION = "FUNCTION"
    LET = "LET"
    TRUE = "TRUE"
    FALSE = "FALSE"
    IF = "IF"
    ELSE = "ELSE"
    RETURN = "RETURN"


@dataclass
class Token:
    type: TokenType
    literal: str


keywords = {
    "fn": TokenType.FUNCTION,
    "let": TokenType.LET,
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
    "if": TokenType.IF,
    "else": TokenType.ELSE,
    "return": TokenType.RETURN,
}

single_char_tokens = {
    '=': TokenType.ASSIGN,
    '+': TokenType.PLUS,
    '-': TokenType.MINUS,
    '!': TokenType.BANG,
    '/': TokenType.SLASH,
    '*': TokenType.ASTERISK,
    '<': TokenType.LT,
    '>': TokenType.GT,
    ';': TokenType.SEMICOLON,
    ':': TokenType.COLON,
    ',': TokenType.COMMA,
    '{': TokenType.LBRACE,
    '}': TokenType.RBRACE,
    '(': TokenType.LPAREN,
    ')': TokenType.RPAREN,
    '[': TokenType.LBRACKET,
    ']': TokenType.RBRACKET,
}


def lookup_ident(ident: str) -> TokenType:
    # Keywords win; anything else is a plain identifier.
    return keywords.get(ident, TokenType.IDENT)
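As a quick sanity check (a throwaway snippet, not part of the files above), the token module can be exercised on its own. It assumes the code above is saved as token.py; a local token.py shadows Python's stdlib token module, which is fine for this toy project since we never need the stdlib one.

from token import Token, TokenType, lookup_ident

print(lookup_ident("fn"))       # TokenType.FUNCTION (a keyword)
print(lookup_ident("counter"))  # TokenType.IDENT (not a keyword)
print(Token(TokenType.INT, "42"))  # Token(type=<TokenType.INT: 'INT'>, literal='42')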
Now that we have the tokens, let's define what we expect from the tokenizer in a test:
from tokenizer import Tokenizer
from token import TokenType


def test_next_token():
    input_text = """
    let five = 5;
    let ten = 10;
    let add = fn(x, y) {
        x + y;
    };
    let result = add(five, ten);
    !-/*5;
    5 < 10 > 5;
    if (5 < 10) {
        return true;
    } else {
        return false;
    }
    10 == 10;
    10 != 9;
    "foobar"
    "foo bar"
    [1, 2];
    {"foo": "bar"}
    """

    tests = [
        (TokenType.LET, "let"),
        (TokenType.IDENT, "five"),
        (TokenType.ASSIGN, "="),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "ten"),
        (TokenType.ASSIGN, "="),
        (TokenType.INT, "10"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "add"),
        (TokenType.ASSIGN, "="),
        (TokenType.FUNCTION, "fn"),
        (TokenType.LPAREN, "("),
        (TokenType.IDENT, "x"),
        (TokenType.COMMA, ","),
        (TokenType.IDENT, "y"),
        (TokenType.RPAREN, ")"),
        (TokenType.LBRACE, "{"),
        (TokenType.IDENT, "x"),
        (TokenType.PLUS, "+"),
        (TokenType.IDENT, "y"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "result"),
        (TokenType.ASSIGN, "="),
        (TokenType.IDENT, "add"),
        (TokenType.LPAREN, "("),
        (TokenType.IDENT, "five"),
        (TokenType.COMMA, ","),
        (TokenType.IDENT, "ten"),
        (TokenType.RPAREN, ")"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.BANG, "!"),
        (TokenType.MINUS, "-"),
        (TokenType.SLASH, "/"),
        (TokenType.ASTERISK, "*"),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.INT, "5"),
        (TokenType.LT, "<"),
        (TokenType.INT, "10"),
        (TokenType.GT, ">"),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.IF, "if"),
        (TokenType.LPAREN, "("),
        (TokenType.INT, "5"),
        (TokenType.LT, "<"),
        (TokenType.INT, "10"),
        (TokenType.RPAREN, ")"),
        (TokenType.LBRACE, "{"),
        (TokenType.RETURN, "return"),
        (TokenType.TRUE, "true"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.ELSE, "else"),
        (TokenType.LBRACE, "{"),
        (TokenType.RETURN, "return"),
        (TokenType.FALSE, "false"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.INT, "10"),
        (TokenType.EQ, "=="),
        (TokenType.INT, "10"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.INT, "10"),
        (TokenType.NOT_EQ, "!="),
        (TokenType.INT, "9"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.STRING, "foobar"),
        (TokenType.STRING, "foo bar"),
        (TokenType.LBRACKET, "["),
        (TokenType.INT, "1"),
        (TokenType.COMMA, ","),
        (TokenType.INT, "2"),
        (TokenType.RBRACKET, "]"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LBRACE, "{"),
        (TokenType.STRING, "foo"),
        (TokenType.COLON, ":"),
        (TokenType.STRING, "bar"),
        (TokenType.RBRACE, "}"),
        (TokenType.EOF, ""),
    ]

    tokenizer = Tokenizer(input_text)
    for i, (expected_type, expected_literal) in enumerate(tests):
        tok = tokenizer.next_token()
        assert tok.type == expected_type, f"tests[{i}] - tokentype wrong. expected={expected_type}, got={tok.type}"
        assert tok.literal == expected_literal, f"tests[{i}] - literal wrong. expected={expected_literal}, got={tok.literal}"
    print("Tokenizer tests passed!")


if __name__ == "__main__":
    test_next_token()
Now let’s implement the tokenizer
from token import Token, TokenType, single_char_tokens, lookup_ident
from typing import Optional


class Tokenizer:
    def __init__(self, input: str):
        self.input = input
        self.position = 0       # position of the current character
        self.read_position = 0  # position of the next character to read
        self.ch: Optional[str] = None
        self.read_char()

    def read_char(self):
        if self.read_position >= len(self.input):
            self.ch = None
        else:
            self.ch = self.input[self.read_position]
        self.position = self.read_position
        self.read_position += 1

    def peek_char(self):
        if self.read_position >= len(self.input):
            return None
        return self.input[self.read_position]

    def read_identifier(self):
        position = self.position
        while self.ch is not None and (self.ch.isalpha() or self.ch == '_'):
            self.read_char()
        return self.input[position:self.position]

    def read_number(self):
        position = self.position
        while self.ch is not None and self.ch.isdigit():
            self.read_char()
        return self.input[position:self.position]

    def read_string(self):
        position = self.position + 1
        self.read_char()  # step past the opening quote
        while self.ch != '"' and self.ch is not None:
            self.read_char()
        literal = self.input[position:self.position]
        self.read_char()  # step past the closing quote
        return literal

    def skip_whitespace(self):
        while self.ch is not None and self.ch.isspace():
            self.read_char()

    def next_token(self):
        self.skip_whitespace()
        if self.ch is None:
            return Token(TokenType.EOF, "")
        # Two-character operators first, so "==" isn't split into two "="s
        if self.ch == '=' and self.peek_char() == '=':
            self.read_char()
            self.read_char()
            return Token(TokenType.EQ, "==")
        if self.ch == '!' and self.peek_char() == '=':
            self.read_char()
            self.read_char()
            return Token(TokenType.NOT_EQ, "!=")
        if self.ch in single_char_tokens:
            tok_type = single_char_tokens[self.ch]
            tok_literal = self.ch
            self.read_char()
            return Token(tok_type, tok_literal)
        if self.ch == '"':
            return Token(TokenType.STRING, self.read_string())
        if self.ch.isdigit():
            return Token(TokenType.INT, self.read_number())
        if self.ch.isalpha() or self.ch == '_':
            tok_literal = self.read_identifier()
            tok_type = lookup_ident(tok_literal)
            return Token(tok_type, tok_literal)
        tok_literal = self.ch
        self.read_char()
        return Token(TokenType.ILLEGAL, tok_literal)
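Besides running the test, a small throwaway driver (assuming the files above are saved as token.py and tokenizer.py) shows the tokenizer in action:

from token import TokenType
from tokenizer import Tokenizer

t = Tokenizer("let answer = 6 * 7;")
while True:
    tok = t.next_token()
    print(tok)
    if tok.type == TokenType.EOF:
        break
# Token(type=<TokenType.LET: 'LET'>, literal='let')
# Token(type=<TokenType.IDENT: 'IDENT'>, literal='answer')
# ... and so on, ending with the EOF token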
Now let’s define the AST,
a program is a list of statements
each statement is a node in AST
from token import Token, TokenType
from typing import Dict, List, Optional
from dataclasses import dataclass


# The base Node interface
@dataclass
class Node:
    def token_literal(self) -> str:
        raise NotImplementedError

    def __str__(self) -> str:
        raise NotImplementedError


# All statement nodes implement this
@dataclass
class Statement(Node):
    def token_literal(self) -> str:
        raise NotImplementedError


# All expression nodes implement this
@dataclass
class Expression(Node):
    def token_literal(self) -> str:
        raise NotImplementedError


@dataclass
class Program(Node):
    statements: List[Statement]

    def token_literal(self) -> str:
        if self.statements:
            return self.statements[0].token_literal()
        return ""

    def __str__(self) -> str:
        return "".join(str(stmt) for stmt in self.statements)


@dataclass
class LetStatement(Statement):
    token: Token
    name: Expression
    value: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"{self.token_literal()} {self.name} = {self.value};"


@dataclass
class ReturnStatement(Statement):
    token: Token
    return_value: Optional[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        if self.return_value is not None:
            return f"{self.token_literal()} {self.return_value};"
        return f"{self.token_literal()};"


@dataclass
class ExpressionStatement(Statement):
    token: Token
    expression: Optional[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        if self.expression is not None:
            return str(self.expression)
        return ""


@dataclass
class BlockStatement(Statement):
    token: Token
    statements: List[Statement]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return "".join(str(stmt) for stmt in self.statements)


@dataclass
class Identifier(Expression):
    token: Token
    value: str

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.value


@dataclass
class Boolean(Expression):
    token: Token
    value: bool

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal


@dataclass
class IntegerLiteral(Expression):
    token: Token
    value: int

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal


@dataclass
class PrefixExpression(Expression):
    token: Token
    operator: str
    right: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.operator}{self.right})"


@dataclass
class InfixExpression(Expression):
    token: Token
    left: Expression
    operator: str
    right: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.left} {self.operator} {self.right})"


@dataclass
class IfExpression(Expression):
    token: Token
    condition: Expression
    consequence: BlockStatement
    alternative: Optional[BlockStatement]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        result = f"if {self.condition} {self.consequence}"
        if self.alternative:
            result += f" else {self.alternative}"
        return result


@dataclass
class FunctionLiteral(Expression):
    token: Token
    parameters: List[Identifier]
    body: BlockStatement

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        params = ", ".join(str(param) for param in self.parameters)
        return f"{self.token_literal()}({params}) {self.body}"


@dataclass
class CallExpression(Expression):
    token: Token
    function: Expression
    arguments: List[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        args = ", ".join(str(arg) for arg in self.arguments)
        return f"{self.function}({args})"


@dataclass
class StringLiteral(Expression):
    token: Token
    value: str

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal


@dataclass
class ArrayLiteral(Expression):
    token: Token
    elements: List[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        elements = ", ".join(str(elem) for elem in self.elements)
        return f"[{elements}]"


@dataclass
class IndexExpression(Expression):
    token: Token
    left: Expression
    index: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.left}[{self.index}])"


@dataclass
class HashLiteral(Expression):
    token: Token
    pairs: Dict[Expression, Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        pairs = ", ".join(f"{key}: {value}" for key, value in self.pairs.items())
        return f"{{{pairs}}}"
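To see the nodes in action, here is a throwaway check that could sit at the bottom of the same file as the node classes. It hand-builds the tree that the parser (coming in the next part) will eventually produce for let x = 5;.

if __name__ == "__main__":
    # Hand-built AST for `let x = 5;`
    program = Program(statements=[
        LetStatement(
            token=Token(TokenType.LET, "let"),
            name=Identifier(token=Token(TokenType.IDENT, "x"), value="x"),
            value=IntegerLiteral(token=Token(TokenType.INT, "5"), value=5),
        ),
    ])
    print(program)  # prints: let x = 5;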
To be continued...
🎰