forked from simdjson/simdjson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstage1.cpp
More file actions
209 lines (187 loc) · 7.4 KB
/
stage1.cpp
File metadata and controls
209 lines (187 loc) · 7.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#include "simdjson.h"
#include "fallback/implementation.h"
namespace simdjson {
namespace fallback {
namespace stage1 {
class structural_scanner {
public:
really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming)
: buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {}
really_inline void add_structural() {
*next_structural_index = idx;
next_structural_index++;
}
really_inline bool is_continuation(uint8_t c) {
return (c & 0b11000000) == 0b10000000;
}
really_inline void validate_utf8_character() {
// Continuation
if (unlikely((buf[idx] & 0b01000000) == 0)) {
// extra continuation
error = UTF8_ERROR;
idx++;
return;
}
// 2-byte
if ((buf[idx] & 0b00100000) == 0) {
// missing continuation
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
// overlong: 1100000_ 10______
if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
idx += 2;
return;
}
// 3-byte
if ((buf[idx] & 0b00010000) == 0) {
// missing continuation
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
// overlong: 11100000 100_____ ________
if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
if (buf[idx] == 0b11101101 && buf[idx+1] >= 0b10100000) { error = UTF8_ERROR; }
idx += 3;
return;
}
// 4-byte
// missing continuation
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
// 11110100 (1001|101_)____
// 1111(1___|011_|0101) 10______
// also includes 5, 6, 7 and 8 byte characters:
// 11111___
if (buf[idx] == 0b11110100 && buf[idx+1] >= 0b10010000) { error = UTF8_ERROR; }
if (buf[idx] >= 0b11110101) { error = UTF8_ERROR; }
idx += 4;
}
really_inline void validate_string() {
idx++; // skip first quote
while (idx < len && buf[idx] != '"') {
if (buf[idx] == '\\') {
idx += 2;
} else if (unlikely(buf[idx] & 0b10000000)) {
validate_utf8_character();
} else {
if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
idx++;
}
}
if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
}
really_inline bool is_whitespace_or_operator(uint8_t c) {
switch (c) {
case '{': case '}': case '[': case ']': case ',': case ':':
case ' ': case '\r': case '\n': case '\t':
return true;
default:
return false;
}
}
//
// Parse the entire input in STEP_SIZE-byte chunks.
//
really_inline error_code scan() {
for (;idx<len;idx++) {
switch (buf[idx]) {
// String
case '"':
add_structural();
validate_string();
break;
// Operator
case '{': case '}': case '[': case ']': case ',': case ':':
add_structural();
break;
// Whitespace
case ' ': case '\r': case '\n': case '\t':
break;
// Primitive or invalid character (invalid characters will be checked in stage 2)
default:
// Anything else, add the structural and go until we find the next one
add_structural();
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
idx++;
};
break;
}
}
if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) {
return EMPTY;
}
*next_structural_index = len;
next_structural_index++;
doc_parser.n_structural_indexes = uint32_t(next_structural_index - doc_parser.structural_indexes.get());
return error;
}
private:
const uint8_t *buf;
uint32_t *next_structural_index;
parser &doc_parser;
uint32_t idx;
uint32_t len;
error_code error;
bool streaming;
}; // structural_scanner
} // namespace stage1
WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
if (unlikely(len > parser.capacity())) {
return CAPACITY;
}
stage1::structural_scanner scanner(buf, uint32_t(len), parser, streaming);
return scanner.scan();
}
// big table for the minifier
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
};
WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
size_t i = 0, pos = 0;
uint8_t quote = 0;
uint8_t nonescape = 1;
while (i < len) {
unsigned char c = buf[i];
uint8_t *meta = jump_table + 3 * c;
quote = quote ^ (meta[0] & nonescape);
dst[pos] = c;
pos += meta[2] | quote;
i += 1;
nonescape = uint8_t(~nonescape) | (meta[1]);
}
dst_len = pos; // we intentionally do not work with a reference
// for fear of aliasing
return SUCCESS;
}
} // namespace fallback
} // namespace simdjson