127 "end": (0, None, None), |
127 "end": (0, None, None), |
128 } |
128 } |
129 |
129 |
130 keywords = set(['and', 'or', 'not']) |
130 keywords = set(['and', 'or', 'not']) |
131 |
131 |
132 def tokenize(program, lookup=None): |
132 # default set of valid characters for the initial letter of symbols |
|
133 _syminitletters = set(c for c in [chr(i) for i in xrange(256)] |
|
134 if c.isalnum() or c in '._@' or ord(c) > 127) |
|
135 |
|
136 # default set of valid characters for non-initial letters of symbols |
|
137 _symletters = set(c for c in [chr(i) for i in xrange(256)] |
|
138 if c.isalnum() or c in '-._/@' or ord(c) > 127) |
|
139 |
|
140 def tokenize(program, lookup=None, syminitletters=None, symletters=None): |
133 ''' |
141 ''' |
134 Parse a revset statement into a stream of tokens |
142 Parse a revset statement into a stream of tokens |
|
143 |
|
144 ``syminitletters`` is the set of valid characters for the initial |
|
145 letter of symbols. |
|
146 |
|
147 By default, character ``c`` is recognized as valid for initial |
|
148 letter of symbols, if ``c.isalnum() or c in '._@' or ord(c) > 127``. |
|
149 |
|
150 ``symletters`` is the set of valid characters for non-initial |
|
151 letters of symbols. |
|
152 |
|
153 By default, character ``c`` is recognized as valid for non-initial |
|
154 letters of symbols, if ``c.isalnum() or c in '-._/@' or ord(c) > 127``. |
135 |
155 |
136 Check that @ is a valid unquoted token character (issue3686): |
156 Check that @ is a valid unquoted token character (issue3686): |
137 >>> list(tokenize("@::")) |
157 >>> list(tokenize("@::")) |
138 [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)] |
158 [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)] |
139 |
159 |
140 ''' |
160 ''' |
|
161 if syminitletters is None: |
|
162 syminitletters = _syminitletters |
|
163 if symletters is None: |
|
164 symletters = _symletters |
141 |
165 |
142 pos, l = 0, len(program) |
166 pos, l = 0, len(program) |
143 while pos < l: |
167 while pos < l: |
144 c = program[pos] |
168 c = program[pos] |
145 if c.isspace(): # skip inter-token whitespace |
169 if c.isspace(): # skip inter-token whitespace |
175 break |
199 break |
176 pos += 1 |
200 pos += 1 |
177 else: |
201 else: |
178 raise error.ParseError(_("unterminated string"), s) |
202 raise error.ParseError(_("unterminated string"), s) |
179 # gather up a symbol/keyword |
203 # gather up a symbol/keyword |
180 elif c.isalnum() or c in '._@' or ord(c) > 127: |
204 elif c in syminitletters: |
181 s = pos |
205 s = pos |
182 pos += 1 |
206 pos += 1 |
183 while pos < l: # find end of symbol |
207 while pos < l: # find end of symbol |
184 d = program[pos] |
208 d = program[pos] |
185 if not (d.isalnum() or d in "-._/@" or ord(d) > 127): |
209 if d not in symletters: |
186 break |
210 break |
187 if d == '.' and program[pos - 1] == '.': # special case for .. |
211 if d == '.' and program[pos - 1] == '.': # special case for .. |
188 pos -= 1 |
212 pos -= 1 |
189 break |
213 break |
190 pos += 1 |
214 pos += 1 |