-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLex.fs
More file actions
99 lines (88 loc) · 3.33 KB
/
Lex.fs
File metadata and controls
99 lines (88 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/// Lexical Tokenisation for Mutton
///
/// This module is responsible for running a simple state machine over input
/// text and producing a seuqence of tokens for the `Parse` stage to consume.
module Mutton.Lex
open System
open System.Text
/// Token kind enumeration
type public TokenKind =
| Error
| Sym
| OpenParen
| CloseParen
| Number
| Space
| EOF
/// Current lexer state
type private LexState =
| Start
| Ident
| Number
| Space
// Used for single-character tokens. This avoids us needing many states for
// each different single character lexeme.
| Simple of TokenKind
/// Tokenise the given `input`
///
/// Runs the lexical state machine over the given `input` text and produces a
/// sequence of `string * TokenKind` pairs for each recognised lexeme.
let public lex input =
/// Convert lexer final states to their corresponding tokens. This is
/// a trivial mapping as all states are final other than the `Start`, which
/// we map to the `Error` token.
let kindForState =
function
| LexState.Ident -> TokenKind.Sym
| LexState.Start -> TokenKind.Error
| LexState.Space -> TokenKind.Space
| LexState.Number -> TokenKind.Number
| LexState.Simple s -> s
/// Compute the next transiation for the state machine. Inspects the current
/// state and input character to deduce a transition, if any. Returns `None`
/// if no transition exists.
let next ch =
function
| LexState.Start ->
match ch with
| c when Char.IsWhiteSpace(c) -> Some(Space)
| c when Char.IsNumber(c) -> Some(Number)
| '(' -> Some(Simple(TokenKind.OpenParen))
| ')' -> Some(Simple(TokenKind.CloseParen))
| _ -> Some(Ident)
| LexState.Ident ->
match ch with
| ')'
| '(' -> None
| c when Char.IsWhiteSpace(c) -> None
| _ -> Some(Ident)
| LexState.Number -> if Char.IsNumber(ch) then Some(Number) else None
| LexState.Space -> if Char.IsWhiteSpace(ch) then Some(Space) else None
| _ -> None
let mutable state = LexState.Start
let mutable lexeme = StringBuilder()
seq {
for c in input do
// For each char in the input we compute the next transition. If one
// exists we take it. If no transition exists we emit a token, and
// compute the next transition again from the `Start` state.
match next c state with
| Some next ->
state <- next
lexeme <- lexeme.Append(c)
| None ->
yield ((lexeme.ToString()), (kindForState state))
lexeme <- lexeme.Clear()
lexeme <- lexeme.Append(c)
match next c Start with
| Some next -> state <- next
| None ->
state <- Start
yield ((lexeme.ToString()), TokenKind.Error)
lexeme <- lexeme.Clear()
// If the state machine was left with a token partially buffered then
// emit that. This handles the case taht multi-char tokens such as
// identifiers are the last item in the source.
if state <> Start then
yield ((lexeme.ToString()), (kindForState state))
}