Mercurial > hg
comparison rust/hg-core/src/revlog/path_encode.rs @ 45539:aebc976fd7d5
hg-core: add path_encode
Differential Revision: https://phab.mercurial-scm.org/D9049
author | Antoine Cezar <antoine.cezar@octobus.net> |
---|---|
date | Tue, 15 Sep 2020 16:45:27 +0200 |
parents | |
children | fad504cfc94b |
comparison
equal
deleted
inserted
replaced
45538:2d5dfc8fed55 | 45539:aebc976fd7d5 |
---|---|
1 use crypto::digest::Digest; | |
2 use crypto::sha1::Sha1; | |
3 | |
/// Scanner states of `_encode`.
///
/// Most states record which prefix of a special name has been read since the
/// start of the current path component; the scanner falls back to `DEFAULT`
/// as soon as the prefix can no longer match.
// NOTE(review): the all-caps names are presumably kept from the
// implementation this was ported from -- confirm before renaming.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum path_state {
    /// First byte of a path component.
    START,
    /// Component starts with "a" (candidate for "aux").
    A,
    /// Component starts with "au".
    AU,
    /// Third byte of a 3-byte name ("aux", "con", "nul", "prn") was read
    /// but not written yet.
    THIRD,
    /// Component starts with "c" (candidate for "con" or "comN").
    C,
    /// Component starts with "co".
    CO,
    /// Component starts with "com" or "lpt"; the third byte is not written
    /// yet.
    COMLPT,
    /// "com" or "lpt" followed by a digit 1-9; the last two bytes are not
    /// written yet.
    COMLPTn,
    /// Component starts with "l" (candidate for "lptN").
    L,
    /// Component starts with "lp".
    LP,
    /// Component starts with "n" (candidate for "nul").
    N,
    /// Component starts with "nu".
    NU,
    /// Component starts with "p" (candidate for "prn").
    P,
    /// Component starts with "pr".
    PR,
    /// Leading '.' of a component (already written in escaped form).
    LDOT,
    /// '.' in a non-leading position (not written yet).
    DOT,
    /// ".h" was read.
    H,
    /// ".hg", ".d" or ".i" was read.
    HGDI,
    /// ' ' in a non-leading position (not written yet).
    SPACE,
    /// Byte of a path component after the first.
    DEFAULT,
}
28 | |
/// State machine for dir-encoding (see `encode_dir`).
#[allow(non_camel_case_types)]
enum dir_state {
    /// A '.' was just copied.
    DDOT,
    /// ".h" was just copied.
    DH,
    /// ".hg", ".d" or ".i" was just copied.
    DHGDI,
    /// Any other position.
    DDEFAULT,
}
37 | |
/// Tests whether byte `c` is a member of `bitset`.
///
/// The set is a 256-bit map: bit `c % 32` of word `c / 32` is set when `c`
/// belongs to it.
fn inset(bitset: &[u32; 8], c: u8) -> bool {
    let word = bitset[usize::from(c / 32)];
    let bit = u32::from(c % 32);
    (word >> bit) & 1 == 1
}
41 | |
/// Appends the single byte `c` at offset `*destlen` and advances `*destlen`.
///
/// When `dest` is `None` nothing is stored and only the length is advanced,
/// which lets callers measure an encoding without a buffer.
///
/// # Panics
///
/// Panics if `dest` is present and `*destlen` is outside of it.
fn charcopy(dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) {
    let pos = *destlen;
    match dest {
        Some(buf) => buf[pos] = c,
        None => {}
    }
    *destlen = pos + 1;
}
48 | |
/// Appends all of `src` at offset `*destlen` and advances `*destlen` by
/// `src.len()`.
///
/// When `dest` is `None` nothing is stored and only the length is advanced.
///
/// # Panics
///
/// Panics if `dest` is present and too short to hold `src` at that offset.
fn memcopy(dest: Option<&mut [u8]>, destlen: &mut usize, src: &[u8]) {
    let start = *destlen;
    let end = start + src.len();
    if let Some(buf) = dest {
        buf[start..end].copy_from_slice(src);
    }
    *destlen = end;
}
55 | |
/// Reborrows an optional output buffer for a shorter lifetime.
///
/// Handing the result to a helper that takes `Option<&mut [u8]>` by value
/// leaves the caller's own option usable for the next call.
fn rewrap_option<'a, 'b: 'a>(
    x: &'a mut Option<&'b mut [u8]>,
) -> Option<&'a mut [u8]> {
    x.as_deref_mut()
}
64 | |
/// Appends `c` as two lowercase hexadecimal digits (high nibble first) at
/// offset `*destlen` and advances `*destlen` by two.
///
/// When `dest` is `None` nothing is stored and only the length is advanced.
///
/// # Panics
///
/// Panics if `dest` is present and too short for the two digits.
fn hexencode<'a>(dest: Option<&'a mut [u8]>, destlen: &mut usize, c: u8) {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let high = DIGITS[usize::from(c >> 4)];
    let low = DIGITS[usize::from(c & 0x0f)];
    if let Some(buf) = dest {
        buf[*destlen] = high;
        buf[*destlen + 1] = low;
    }
    *destlen += 2;
}
74 | |
75 /* 3-byte escape: tilde followed by two hex digits */ | |
76 fn escape3(mut dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) { | |
77 charcopy(rewrap_option(&mut dest), destlen, b'~'); | |
78 hexencode(dest, destlen, c); | |
79 } | |
80 | |
81 fn encode_dir(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize { | |
82 let mut state = dir_state::DDEFAULT; | |
83 let mut i = 0; | |
84 let mut destlen = 0; | |
85 | |
86 while i < src.len() { | |
87 match state { | |
88 dir_state::DDOT => match src[i] { | |
89 b'd' | b'i' => { | |
90 state = dir_state::DHGDI; | |
91 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
92 i += 1; | |
93 } | |
94 b'h' => { | |
95 state = dir_state::DH; | |
96 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
97 i += 1; | |
98 } | |
99 _ => { | |
100 state = dir_state::DDEFAULT; | |
101 } | |
102 }, | |
103 dir_state::DH => { | |
104 if src[i] == b'g' { | |
105 state = dir_state::DHGDI; | |
106 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
107 i += 1; | |
108 } else { | |
109 state = dir_state::DDEFAULT; | |
110 } | |
111 } | |
112 dir_state::DHGDI => { | |
113 if src[i] == b'/' { | |
114 memcopy(rewrap_option(&mut dest), &mut destlen, b".hg"); | |
115 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
116 i += 1; | |
117 } | |
118 state = dir_state::DDEFAULT; | |
119 } | |
120 dir_state::DDEFAULT => { | |
121 if src[i] == b'.' { | |
122 state = dir_state::DDOT | |
123 } | |
124 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
125 i += 1; | |
126 } | |
127 } | |
128 } | |
129 destlen | |
130 } | |
131 | |
132 fn _encode( | |
133 twobytes: &[u32; 8], | |
134 onebyte: &[u32; 8], | |
135 mut dest: Option<&mut [u8]>, | |
136 src: &[u8], | |
137 encodedir: bool, | |
138 ) -> usize { | |
139 let mut state = path_state::START; | |
140 let mut i = 0; | |
141 let mut destlen = 0; | |
142 let len = src.len(); | |
143 | |
144 while i < len { | |
145 match state { | |
146 path_state::START => match src[i] { | |
147 b'/' => { | |
148 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
149 i += 1; | |
150 } | |
151 b'.' => { | |
152 state = path_state::LDOT; | |
153 escape3(rewrap_option(&mut dest), &mut destlen, src[i]); | |
154 i += 1; | |
155 } | |
156 b' ' => { | |
157 state = path_state::DEFAULT; | |
158 escape3(rewrap_option(&mut dest), &mut destlen, src[i]); | |
159 i += 1; | |
160 } | |
161 b'a' => { | |
162 state = path_state::A; | |
163 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
164 i += 1; | |
165 } | |
166 b'c' => { | |
167 state = path_state::C; | |
168 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
169 i += 1; | |
170 } | |
171 b'l' => { | |
172 state = path_state::L; | |
173 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
174 i += 1; | |
175 } | |
176 b'n' => { | |
177 state = path_state::N; | |
178 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
179 i += 1; | |
180 } | |
181 b'p' => { | |
182 state = path_state::P; | |
183 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
184 i += 1; | |
185 } | |
186 _ => { | |
187 state = path_state::DEFAULT; | |
188 } | |
189 }, | |
190 path_state::A => { | |
191 if src[i] == b'u' { | |
192 state = path_state::AU; | |
193 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
194 i += 1; | |
195 } else { | |
196 state = path_state::DEFAULT; | |
197 } | |
198 } | |
199 path_state::AU => { | |
200 if src[i] == b'x' { | |
201 state = path_state::THIRD; | |
202 i += 1; | |
203 } else { | |
204 state = path_state::DEFAULT; | |
205 } | |
206 } | |
207 path_state::THIRD => { | |
208 state = path_state::DEFAULT; | |
209 match src[i] { | |
210 b'.' | b'/' | b'\0' => escape3( | |
211 rewrap_option(&mut dest), | |
212 &mut destlen, | |
213 src[i - 1], | |
214 ), | |
215 _ => i -= 1, | |
216 } | |
217 } | |
218 path_state::C => { | |
219 if src[i] == b'o' { | |
220 state = path_state::CO; | |
221 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
222 i += 1; | |
223 } else { | |
224 state = path_state::DEFAULT; | |
225 } | |
226 } | |
227 path_state::CO => { | |
228 if src[i] == b'm' { | |
229 state = path_state::COMLPT; | |
230 i += 1; | |
231 } else if src[i] == b'n' { | |
232 state = path_state::THIRD; | |
233 i += 1; | |
234 } else { | |
235 state = path_state::DEFAULT; | |
236 } | |
237 } | |
238 path_state::COMLPT => { | |
239 if src[i] >= b'1' && src[i] <= b'9' { | |
240 state = path_state::COMLPTn; | |
241 i += 1; | |
242 } else { | |
243 state = path_state::DEFAULT; | |
244 charcopy( | |
245 rewrap_option(&mut dest), | |
246 &mut destlen, | |
247 src[i - 1], | |
248 ); | |
249 } | |
250 } | |
251 path_state::COMLPTn => { | |
252 state = path_state::DEFAULT; | |
253 match src[i] { | |
254 b'.' | b'/' | b'\0' => { | |
255 escape3( | |
256 rewrap_option(&mut dest), | |
257 &mut destlen, | |
258 src[i - 2], | |
259 ); | |
260 charcopy( | |
261 rewrap_option(&mut dest), | |
262 &mut destlen, | |
263 src[i - 1], | |
264 ); | |
265 } | |
266 _ => { | |
267 memcopy( | |
268 rewrap_option(&mut dest), | |
269 &mut destlen, | |
270 &src[i - 2..i], | |
271 ); | |
272 } | |
273 } | |
274 } | |
275 path_state::L => { | |
276 if src[i] == b'p' { | |
277 state = path_state::LP; | |
278 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
279 i += 1; | |
280 } else { | |
281 state = path_state::DEFAULT; | |
282 } | |
283 } | |
284 path_state::LP => { | |
285 if src[i] == b't' { | |
286 state = path_state::COMLPT; | |
287 i += 1; | |
288 } else { | |
289 state = path_state::DEFAULT; | |
290 } | |
291 } | |
292 path_state::N => { | |
293 if src[i] == b'u' { | |
294 state = path_state::NU; | |
295 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
296 i += 1; | |
297 } else { | |
298 state = path_state::DEFAULT; | |
299 } | |
300 } | |
301 path_state::NU => { | |
302 if src[i] == b'l' { | |
303 state = path_state::THIRD; | |
304 i += 1; | |
305 } else { | |
306 state = path_state::DEFAULT; | |
307 } | |
308 } | |
309 path_state::P => { | |
310 if src[i] == b'r' { | |
311 state = path_state::PR; | |
312 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
313 i += 1; | |
314 } else { | |
315 state = path_state::DEFAULT; | |
316 } | |
317 } | |
318 path_state::PR => { | |
319 if src[i] == b'n' { | |
320 state = path_state::THIRD; | |
321 i += 1; | |
322 } else { | |
323 state = path_state::DEFAULT; | |
324 } | |
325 } | |
326 path_state::LDOT => match src[i] { | |
327 b'd' | b'i' => { | |
328 state = path_state::HGDI; | |
329 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
330 i += 1; | |
331 } | |
332 b'h' => { | |
333 state = path_state::H; | |
334 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
335 i += 1; | |
336 } | |
337 _ => { | |
338 state = path_state::DEFAULT; | |
339 } | |
340 }, | |
341 path_state::DOT => match src[i] { | |
342 b'/' | b'\0' => { | |
343 state = path_state::START; | |
344 memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e"); | |
345 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
346 i += 1; | |
347 } | |
348 b'd' | b'i' => { | |
349 state = path_state::HGDI; | |
350 charcopy(rewrap_option(&mut dest), &mut destlen, b'.'); | |
351 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
352 i += 1; | |
353 } | |
354 b'h' => { | |
355 state = path_state::H; | |
356 memcopy(rewrap_option(&mut dest), &mut destlen, b".h"); | |
357 i += 1; | |
358 } | |
359 _ => { | |
360 state = path_state::DEFAULT; | |
361 charcopy(rewrap_option(&mut dest), &mut destlen, b'.'); | |
362 } | |
363 }, | |
364 path_state::H => { | |
365 if src[i] == b'g' { | |
366 state = path_state::HGDI; | |
367 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
368 i += 1; | |
369 } else { | |
370 state = path_state::DEFAULT; | |
371 } | |
372 } | |
373 path_state::HGDI => { | |
374 if src[i] == b'/' { | |
375 state = path_state::START; | |
376 if encodedir { | |
377 memcopy( | |
378 rewrap_option(&mut dest), | |
379 &mut destlen, | |
380 b".hg", | |
381 ); | |
382 } | |
383 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
384 i += 1 | |
385 } else { | |
386 state = path_state::DEFAULT; | |
387 } | |
388 } | |
389 path_state::SPACE => match src[i] { | |
390 b'/' | b'\0' => { | |
391 state = path_state::START; | |
392 memcopy(rewrap_option(&mut dest), &mut destlen, b"~20"); | |
393 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
394 i += 1; | |
395 } | |
396 _ => { | |
397 state = path_state::DEFAULT; | |
398 charcopy(rewrap_option(&mut dest), &mut destlen, b' '); | |
399 } | |
400 }, | |
401 path_state::DEFAULT => { | |
402 while i != len && inset(onebyte, src[i]) { | |
403 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]); | |
404 i += 1; | |
405 } | |
406 if i == len { | |
407 break; | |
408 } | |
409 match src[i] { | |
410 b'.' => { | |
411 state = path_state::DOT; | |
412 i += 1 | |
413 } | |
414 b' ' => { | |
415 state = path_state::SPACE; | |
416 i += 1 | |
417 } | |
418 b'/' => { | |
419 state = path_state::START; | |
420 charcopy(rewrap_option(&mut dest), &mut destlen, b'/'); | |
421 i += 1; | |
422 } | |
423 _ => { | |
424 if inset(onebyte, src[i]) { | |
425 loop { | |
426 charcopy( | |
427 rewrap_option(&mut dest), | |
428 &mut destlen, | |
429 src[i], | |
430 ); | |
431 i += 1; | |
432 if !(i < len && inset(onebyte, src[i])) { | |
433 break; | |
434 } | |
435 } | |
436 } else if inset(twobytes, src[i]) { | |
437 let c = src[i]; | |
438 i += 1; | |
439 charcopy( | |
440 rewrap_option(&mut dest), | |
441 &mut destlen, | |
442 b'_', | |
443 ); | |
444 charcopy( | |
445 rewrap_option(&mut dest), | |
446 &mut destlen, | |
447 if c == b'_' { b'_' } else { c + 32 }, | |
448 ); | |
449 } else { | |
450 escape3( | |
451 rewrap_option(&mut dest), | |
452 &mut destlen, | |
453 src[i], | |
454 ); | |
455 i += 1; | |
456 } | |
457 } | |
458 } | |
459 } | |
460 } | |
461 } | |
462 match state { | |
463 path_state::START => (), | |
464 path_state::A => (), | |
465 path_state::AU => (), | |
466 path_state::THIRD => { | |
467 escape3(rewrap_option(&mut dest), &mut destlen, src[i - 1]) | |
468 } | |
469 path_state::C => (), | |
470 path_state::CO => (), | |
471 path_state::COMLPT => { | |
472 charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]) | |
473 } | |
474 path_state::COMLPTn => { | |
475 escape3(rewrap_option(&mut dest), &mut destlen, src[i - 2]); | |
476 charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]); | |
477 } | |
478 path_state::L => (), | |
479 path_state::LP => (), | |
480 path_state::N => (), | |
481 path_state::NU => (), | |
482 path_state::P => (), | |
483 path_state::PR => (), | |
484 path_state::LDOT => (), | |
485 path_state::DOT => { | |
486 memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e"); | |
487 } | |
488 path_state::H => (), | |
489 path_state::HGDI => (), | |
490 path_state::SPACE => { | |
491 memcopy(rewrap_option(&mut dest), &mut destlen, b"~20"); | |
492 } | |
493 path_state::DEFAULT => (), | |
494 }; | |
495 destlen | |
496 } | |
497 | |
498 fn basic_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize { | |
499 let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0]; | |
500 let onebyte: [u32; 8] = | |
501 [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0]; | |
502 _encode(&twobytes, &onebyte, dest, src, true) | |
503 } | |
504 | |
/// Longest path, in bytes, that `path_encode` returns in plain encoded form;
/// anything longer is replaced by a hashed name, which `hash_mangle` sizes
/// against this same limit.
const MAXSTOREPATHLEN: usize = 120;
506 | |
507 fn lower_encode(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize { | |
508 let onebyte: [u32; 8] = | |
509 [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0]; | |
510 let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0]; | |
511 let mut destlen = 0; | |
512 for c in src { | |
513 if inset(&onebyte, *c) { | |
514 charcopy(rewrap_option(&mut dest), &mut destlen, *c) | |
515 } else if inset(&lower, *c) { | |
516 charcopy(rewrap_option(&mut dest), &mut destlen, *c + 32) | |
517 } else { | |
518 escape3(rewrap_option(&mut dest), &mut destlen, *c) | |
519 } | |
520 } | |
521 destlen | |
522 } | |
523 | |
524 fn aux_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize { | |
525 let twobytes = [0; 8]; | |
526 let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0]; | |
527 _encode(&twobytes, &onebyte, dest, src, false) | |
528 } | |
529 | |
530 fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> { | |
531 let dirprefixlen = 8; | |
532 let maxshortdirslen = 68; | |
533 let mut destlen = 0; | |
534 | |
535 let last_slash = src.iter().rposition(|b| *b == b'/'); | |
536 let last_dot: Option<usize> = { | |
537 let s = last_slash.unwrap_or(0); | |
538 src[s..] | |
539 .iter() | |
540 .rposition(|b| *b == b'.') | |
541 .and_then(|i| Some(i + s)) | |
542 }; | |
543 | |
544 let mut dest = vec![0; MAXSTOREPATHLEN]; | |
545 memcopy(Some(&mut dest), &mut destlen, b"dh/"); | |
546 | |
547 { | |
548 let mut first = true; | |
549 for slice in src[..last_slash.unwrap_or_else(|| src.len())] | |
550 .split(|b| *b == b'/') | |
551 { | |
552 let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)]; | |
553 if destlen + (slice.len() + if first { 0 } else { 1 }) | |
554 > maxshortdirslen + 3 | |
555 { | |
556 break; | |
557 } else { | |
558 if !first { | |
559 charcopy(Some(&mut dest), &mut destlen, b'/') | |
560 }; | |
561 memcopy(Some(&mut dest), &mut destlen, slice); | |
562 if dest[destlen - 1] == b'.' || dest[destlen - 1] == b' ' { | |
563 dest[destlen - 1] = b'_' | |
564 } | |
565 } | |
566 first = false; | |
567 } | |
568 if !first { | |
569 charcopy(Some(&mut dest), &mut destlen, b'/'); | |
570 } | |
571 } | |
572 | |
573 let used = destlen + 40 + { | |
574 if let Some(l) = last_dot { | |
575 src.len() - l | |
576 } else { | |
577 0 | |
578 } | |
579 }; | |
580 | |
581 if MAXSTOREPATHLEN > used { | |
582 let slop = MAXSTOREPATHLEN - used; | |
583 let basenamelen = match last_slash { | |
584 Some(l) => src.len() - l - 1, | |
585 None => src.len(), | |
586 }; | |
587 let basenamelen = std::cmp::min(basenamelen, slop); | |
588 if basenamelen > 0 { | |
589 let start = match last_slash { | |
590 Some(l) => l + 1, | |
591 None => 0, | |
592 }; | |
593 memcopy( | |
594 Some(&mut dest), | |
595 &mut destlen, | |
596 &src[start..][..basenamelen], | |
597 ) | |
598 } | |
599 } | |
600 for c in sha { | |
601 hexencode(Some(&mut dest), &mut destlen, *c); | |
602 } | |
603 if let Some(l) = last_dot { | |
604 memcopy(Some(&mut dest), &mut destlen, &src[l..]); | |
605 } | |
606 if destlen == dest.len() { | |
607 dest | |
608 } else { | |
609 // sometimes the path are shorter than MAXSTOREPATHLEN | |
610 dest[..destlen].to_vec() | |
611 } | |
612 } | |
613 | |
614 const MAXENCODE: usize = 4096 * 4; | |
615 fn hash_encode(src: &[u8]) -> Vec<u8> { | |
616 let dired = &mut [0; MAXENCODE]; | |
617 let lowered = &mut [0; MAXENCODE]; | |
618 let auxed = &mut [0; MAXENCODE]; | |
619 let baselen = (src.len() - 5) * 3; | |
620 if baselen >= MAXENCODE { | |
621 panic!("path_encode::hash_encore: string too long: {}", baselen) | |
622 }; | |
623 let dirlen = encode_dir(Some(&mut dired[..]), src); | |
624 let sha = { | |
625 let mut hasher = Sha1::new(); | |
626 hasher.input(&dired[..dirlen]); | |
627 let mut hash = vec![0; 20]; | |
628 hasher.result(&mut hash); | |
629 hash | |
630 }; | |
631 let lowerlen = lower_encode(Some(&mut lowered[..]), &dired[..dirlen][5..]); | |
632 let auxlen = aux_encode(Some(&mut auxed[..]), &lowered[..lowerlen]); | |
633 hash_mangle(&auxed[..auxlen], &sha) | |
634 } | |
635 | |
636 pub fn path_encode(path: &[u8]) -> Vec<u8> { | |
637 let newlen = if path.len() <= MAXSTOREPATHLEN { | |
638 basic_encode(None, path) | |
639 } else { | |
640 MAXSTOREPATHLEN + 1 | |
641 }; | |
642 if newlen <= MAXSTOREPATHLEN { | |
643 if newlen == path.len() { | |
644 path.to_vec() | |
645 } else { | |
646 let mut res = vec![0; newlen]; | |
647 basic_encode(Some(&mut res), path); | |
648 res | |
649 } | |
650 } else { | |
651 hash_encode(&path) | |
652 } | |
653 } |