Jim says
... all very cool.
Parsing an actually complicated language
So I decided to spend this week playing with the same concepts I
have been using last week, but on a harder grammar (as well as putting
down some base work for my plan product). First I played around with a few
different syntax styles (this is just a couple of them):
# VERSION 1
define tick_tock(array[int]:pins): None = {
for i in pins {
setPin HIGH i ;
}
for i in pins {
setPin LOW i ;
}
}
#/
/ Example multi-line comment.
/#
define setup(): None = {
//Example single line comment
pinMode OUTPUT [1,2,3,5]
consoleEcho True
raw_c "LCD.println("STARTING PROGRAM...")
}
define loop(): None = {
tick_tock [1,2,3,4,5]
consoleEcho "Finished Cycle.\n"
}
# VERSION 2
(define tick-tock (lambda (pins)
(map TurnOnPins pins)
(map TurnOffPins pins)))
(define setup
(lambda ()
(pinMode OUTPUT '(1 2 3 4 5))
(set! consoleEcho true)
(raw_c "Serial.Begin(9600)")))
(define loop
(lambda ()
(tick_tock '(1 2 3 4 5))
(consoleEcho "Cycle finished.\n")))
# VERSION 3
let tick_tock (array[int] -> void):
map TurnOn 1..5
map TurnOff 1..5
let setup(void -> void):
pinMode OUTPUT 1..5
setConsoleEcho True
raw_c "Serial.begin(9600)"
let loop(void -> void):
tick_tock 1..5
consoleEcho "Cycle finished."
# VERSION 4
def tick_tock(Array[Ints]: pins): void = {
map TurnOn pins;
map TurnOn pins;
}
But I ended up settling on something like this:
#/
multi line comments example.
Overview:
- Strongly typed.
- All statements end with ";"
- By default all values are immutable.
- function declares require a type signature:
* type sig = a list of args and their types followed
by a "->" and then the return type.
* example: define sum_list([num_lst:Array[Int]] -> Int){ }
- single statements can be made in direct c by calling "raw_c" and
passing it a string of the needed C code.
- functions are invoked by ML/F# syntax:
* <function_name> <values seperated by spaces>
* example: sumList [1,2,3,4,5] ;
- "mutable" is a wrapper which will decode values in mutable memory.
- Functions return mutable data types by default so they can be stored in
dynamic memory.
/#
let list_of_pins : array[int] = [1,2,3,4,5] ; // stored in program mem (immutable)
let on : mutable[int] = 7 * 8 - 5 ; // a dynamically stored variable (mutable)
//still requires a setup fucntion: runs once
def setup([void] -> void) {
pinMode OUTPUT list_of_pins ;
raw_c "Serial.begin(9600)" ;
}
//still requires a loop function: runs repeatedly
def loop([void] -> void) {
while(on){
setPins ON list_of_pins;
delay 500 ; //in millis
setPins OFF list_of_pins;
delay 100 * 5 ;
turn_pattern 1 ON ;
delay 500 ;
5 + 7 * 9;
}
}
def turn_pattern([pattern:int, mode:int] -> void) {
// Turns pins on/off from two different patterns.
let pattern1 : array[int] = [1,2,3,5];
let pattern2 : array[int] = [3,6];
if(pattern == 1){
setPins mode pattern1;
}
if(pattern == 2) {
setPins mode pattern2;
}
}
It pulls a lot of the syntax I like from F# and Scala.
The treatment of types and mutability also draws out the ideas
about how an AVR processor handles data (I can talk more about this).
So I decided to make a grammar:
#lang brag
; main structure
program: (let-statement | define-statement )+
let-statement: let id colon type eq (lit | id | func-call| expr) [delimit]
relet-statement: let id eq (lit | id | func-call | expr) [delimit]
define-statement: def id signature scope-statement
func-call: id (lit | id | func-call | expr)*
statement: (let-statement | func-call | expr | relet-statement) delimit
;code blocks
scope-statement: lbrac (statement|while-loop|conditional)*
rbrac
; Typing statements
type: TYPE | TYPE lSqBrac TYPE rSqBrac ;;; "int" | "Array[int]"
signature: lparen
lSqBrac (id colon type comma)* [id colon] type
rSqBrac arrow (id colon type comma)* [id colon] type
rparen
; control-flow
while-loop: while lparen (comparison | id) rparen scope-statement
conditional: if lparen comparison rparen scope-statement
[else scope-statement]
comparison: (statement| lit| id) bool-comp (statement|lit|id) ;;; "x == y"
; Seperating characters
lbrac: LBRAC ;;; {
rbrac: RBRAC ;;; }
lparen: LPAREN ;;; (
rparen: RPAREN ;;; )
lSqBrac: LSQBRAC ;;; [
rSqBrac: RSQBRAC ;;; ]
comma: COMMA ;;; ,
colon: COLON ;;; :
delimit: SEMI-COLON ;;; ;
eq: EQUAL ;;; =
; Ops taken from http://math.purduecal.edu/~rlkraft/cs31600-2012/chapter03/syntax-examples.html
expr: term [(add | sub) term]*
term: factor [(mult | div) factor]*
factor: (lparen expr rparen | id | lit)
add: ADD-OP
sub: SUB-OP
mult: MULT-OP
div: DIV-OP
; data structure
lit: (int | float | string | array)
array: lSqBrac ((lit comma)* lit) rSqBrac ;;; [lit,lit,...]
int: INT
float: FLOAT
string: STRING
while: WHILE ;;; "while"
if: IF ;;; "if"
else: ELSE ;;; "else"
bool-comp: BOOL-COMP ;;; "==" | ">" ... ect
;;; Type info
arrow: ARROW ;;; "->"
; other keywords
id: ID
def: DEF
let: LET
and a lexer:
#lang racket
#| possible_grammar_lex.rkt
Description:
A lexer for some non-trivial grammar.
To Use:
Open in Dr. Racket and press "run".
TODO:
Come up with actual names for these files.
Break up the lexer file.
Debug the grammar.
Racket 6.6 | 10/2/16 | License MIT
|#
(require parser-tools/lex)
(require parser-tools/lex-sre)
(require brag/support)
(require "possible-grammar.rkt")
(define-lex-abbrev int? (+ (char-range #\0 #\9)))
(define-lex-abbrev float? (: (+ (char-range #\0 #\9))
"."
(+ (char-range #\0 #\9))))
(define-lex-abbrev string? (: #\" (* (char-complement #\")) #\"))
(define-lex-abbrev identifier? (: (+ alphabetic)
(* (or alphabetic #\_ #\! #\? (char-range #\0 #\9)))))
(define-lex-abbrev bool-comp? (or "==" "&&" "||" "<" ">" ">=" "<=" "!="))
(define-lex-abbrev type? (or "int" "string" "float" "array" "mutable" "void"))
(define-lex-abbrev comment? (or (: "//" (* (char-complement #\newline)))
(: "#/" (complement (: any-string "/#" any-string)) "/#")))
(define end-of-file #f)
(define line 0)
(define column 0)
(define offset 0)
;Takes a file port and returns a single token-struct per call.
(define tokenize
(lambda (file)
(set!-values (line column offset) (port-next-location file))
(define find-token
(lexer
[comment? '(COMMENT lexeme #:skip? #t)]
[int? '(INT lexeme)]
[float? '(FLOAT lexeme)]
[string? '(STRING lexeme)]
[type? '(TYPE lexeme)]
[#\* '(MULT-OP lexeme)]
[#\/ '(DIV-OP lexeme)]
[#\- '(SUB-OP lexeme)]
[#\+ '(ADD-OP lexeme)]
;;;[#\! (token 'NEGATE lexeme)] <- soon to be considered
[#\{ '(LBRAC lexeme)]
[#\} '(RBRAC lexeme)]
[#\( '(LPAREN lexeme)]
[#\) '(RPAREN lexeme)]
[#\[ '(LSQBRAC lexeme)]
[#\] '(RSQBRAC lexeme)]
[#\, '(COMMA lexeme)]
[#\: '(COLON lexeme)]
[#\; '(SEMI-COLON lexeme)]
[bool-comp? '(BOOL-COMP lexeme)]
[#\= '(EQUAL lexeme)]
["while" '(WHILE lexeme)]
["if" '(IF lexeme)]
["else" '(ELSE lexeme)]
["->" '(ARROW lexeme)]
["def" '(DEF lexeme)]
["let" '(LET lexeme)]
[identifier? '(ID lexeme)]
[(or #\newline #\space #\tab nothing) '(WHITESPACE lexeme #:skip? #t)]
[(eof) (set! end-of-file #t)]))
(define result (find-token file))
(cond [(equal? result (void)) (void)] ;;;EOF
[(equal? (length result) 2) (token (first result) (second result) #:line line #:column column)] ;;; TOKEN
[else (token (first result) #:skip? #t)]))) ;;; WHITESPACE or COMMENT
(define token-stream '()) ; accumulator
;recursively lexs a file-port accumulating tokens in token-stream.
(define reader
(lambda (file)
(if end-of-file
'()
(and (set! token-stream (append token-stream(list (tokenize file))))
(reader file)))))
(define test (open-input-file "template_syntax.txt")) ;;example file
(port-count-lines! test) ;;enable line counting on port
(printf "File Found.\nTokenizing...\n")
(if (equal? (reader test) '()) ;If no mal-formed return was produced, print success
(printf "File has been tokenized.\nParsing...\n")
(printf "Error tokenizing file!\n"))
(syntax->datum (parse token-stream)) ;; parse and return a datum
Running it on the example file (with one function removed) above produces this syntax tree:
Which is far too big and totally breaks CS.
I worked out the AST->datum by hand and it looks to be correct. I have attached the AST below.
Other than that, I also figured out how to get lines and columns in parse errors.
I have attached a zip of the project that should be "ready-to-run."