-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathm17n_sedlexing.ml
186 lines (166 loc) · 5.38 KB
/
m17n_sedlexing.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
type lexbuf = {
mutable slex_start_p : Lexing.position;
mutable slex_curr : (int * Uchar.t) GenClone.t;
mutable slex_curr_g : (int * Uchar.t) Gen.t;
mutable slex_curr_p : Lexing.position;
mutable slex_lexeme : Uchar.t list;
mutable slex_slot : int;
mutable slex_mem : (int * Uchar.t) GenClone.t;
mutable slex_mem_p : Lexing.position;
mutable slex_mem_lexeme : Uchar.t list;
}
(* Process a `string gen` and return an `(int, uchar) gen`, iterating
decoded Unicode characters, together with their lengths in
the UTF-8 *byte* representation. *)
let decoder kind input =
let uutf = Uutf.decoder ~nln:(`ASCII (Uchar.of_int 0x000A)) ~encoding:`UTF_8 `Manual in
let pos = ref 0 in
let rec gen () =
match Uutf.decode uutf with
| `End -> None
| `Uchar u ->
if kind = `Batch then
Some (1, u)
else
let pos' = Uutf.decoder_byte_count uutf in
let len = pos' - !pos in
pos := pos';
Some (len, u)
| `Await -> (* We exhausted the buffer. *)
begin match input () with
| None -> (* We exhausted the input. *)
Uutf.Manual.src uutf (Bytes.of_string "") 0 0
| Some (chunk, start, len) -> (* There's some more input. *)
Uutf.Manual.src uutf chunk start len
end;
gen ()
| `Malformed bytes -> (* The input is malformed. *)
(* Return U+FFFD, it will be handled by the lexer. *)
if kind = `Batch then
Some (1, Uutf.u_rep)
else
Some (String.length bytes, Uutf.u_rep)
in
gen
let create ?(kind=`Batch) ?(filename="//unknown//") input =
let pos = Lexing.{
pos_fname = filename;
pos_lnum = 1;
pos_bol = 0;
pos_cnum = 0; } in
let gen = GenMList.(to_clonable
(of_gen_lazy ~caching:(kind <> `Toplevel) (decoder kind input))) in
{ slex_start_p = pos;
slex_curr = gen;
slex_curr_g = gen#gen;
slex_curr_p = pos;
slex_lexeme = [];
slex_slot = -1;
slex_mem = gen#clone;
slex_mem_p = pos;
slex_mem_lexeme = []; }
let memorize lexbuf =
lexbuf.slex_mem_lexeme <- lexbuf.slex_lexeme;
lexbuf.slex_mem_p <- lexbuf.slex_curr_p;
lexbuf.slex_mem <- lexbuf.slex_curr#clone
let start lexbuf =
lexbuf.slex_start_p <- lexbuf.slex_curr_p;
lexbuf.slex_lexeme <- [];
lexbuf.slex_slot <- -1;
memorize lexbuf
let next lexbuf =
let open Lexing in
match lexbuf.slex_curr_g () with
| None -> -1
| Some (len, uchar) ->
let pos = lexbuf.slex_curr_p in
if Uchar.to_int uchar = 0x000A then
lexbuf.slex_curr_p <- { pos with
pos_lnum = pos.pos_lnum + 1;
pos_cnum = pos.pos_cnum + len;
pos_bol = pos.pos_cnum + len; }
else
lexbuf.slex_curr_p <- { pos with
pos_cnum = pos.pos_cnum + len; };
lexbuf.slex_lexeme <- uchar :: lexbuf.slex_lexeme;
Uchar.to_int uchar
let mark lexbuf slot =
lexbuf.slex_slot <- slot;
memorize lexbuf
let backtrack lexbuf =
let slot = lexbuf.slex_slot in
lexbuf.slex_curr <- lexbuf.slex_mem#clone;
lexbuf.slex_curr_g <- lexbuf.slex_curr#gen;
lexbuf.slex_curr_p <- lexbuf.slex_mem_p;
lexbuf.slex_lexeme <- lexbuf.slex_mem_lexeme;
slot
let lexeme lexbuf =
List.rev lexbuf.slex_lexeme
let lexeme_char n lexbuf =
List.nth (lexeme lexbuf) n
let sub_lexeme (lft, rgt) lexbuf =
let rec drop i =
function
| x::lst when i > 0 -> drop (i-1) lst
| [] when i > 0 -> assert false
| lst -> lst
in
let map i = if i >= 0 then i else -(i + 1) in
lexbuf.slex_lexeme |> drop (map rgt) |>
List.rev |> drop (map lft)
let fill_lexbuf lexbuf oldlexbuf =
let open Lexing in
oldlexbuf.lex_start_p <- lexbuf.slex_start_p;
oldlexbuf.lex_curr_p <- lexbuf.slex_curr_p
let uunf_normalize form uchars =
let buf = Buffer.create (List.length uchars) in
let uunf = Uunf.create form in
let rec add uchar =
match Uunf.add uunf uchar with
| `Uchar u -> Uutf.Buffer.add_utf_8 buf u; add `Await
| `Await | `End -> ()
in
List.iter (fun uchar -> add (`Uchar uchar)) uchars;
add `End;
Buffer.contents buf
let encode ?normalize uchars =
match normalize with
| None ->
let buf = Buffer.create (List.length uchars) in
List.iter (Uutf.Buffer.add_utf_8 buf) uchars;
Buffer.contents buf
| Some form ->
uunf_normalize form uchars
let utf8_lexeme ?normalize lexbuf =
encode ?normalize (lexeme lexbuf)
let utf8_sub_lexeme ?normalize range lexbuf =
encode ?normalize (sub_lexeme range lexbuf)
let expand_token lexbuf f =
let start_p = lexbuf.slex_start_p in
let result = f () in
lexbuf.slex_start_p <- start_p;
result
let location lexbuf =
Location.{
loc_ghost = false;
loc_start = lexbuf.slex_start_p;
loc_end = lexbuf.slex_curr_p; }
let set_position lexbuf file line =
let open Lexing in
lexbuf.slex_start_p <- { lexbuf.slex_start_p with
pos_fname = file;
pos_lnum = line; }
let unshift lexbuf =
let open Lexing in
match lexbuf.slex_lexeme with
| [] -> assert false
| uchar :: lexeme ->
assert (Uchar.to_int uchar <> 0x000A &&
Uchar.to_int uchar < 0x0100);
let slex_curr = GenClone.to_prependable lexbuf.slex_curr in
slex_curr#prepend (1, uchar);
lexbuf.slex_curr <- (slex_curr :> (int * Uchar.t) GenClone.t);
lexbuf.slex_curr_g <- slex_curr#gen;
lexbuf.slex_curr_p <- { lexbuf.slex_curr_p with
pos_cnum = lexbuf.slex_curr_p.pos_cnum - 1; };
lexbuf.slex_lexeme <- lexeme