comparison rust/hg-core/src/revlog/path_encode.rs @ 45539:aebc976fd7d5

hg-core: add path_encode Differential Revision: https://phab.mercurial-scm.org/D9049
author Antoine Cezar <antoine.cezar@octobus.net>
date Tue, 15 Sep 2020 16:45:27 +0200
parents
children fad504cfc94b
comparison
equal deleted inserted replaced
45538:2d5dfc8fed55 45539:aebc976fd7d5
1 use crypto::digest::Digest;
2 use crypto::sha1::Sha1;
3
/* States of the per-component state machine driven by `_encode`.
 *
 * Each variant records what has been seen so far in the current path
 * component. The quoted names below are written in upper case, but
 * `_encode` compares against the lower-case bytes (b'a', b'u', ...). */
#[derive(PartialEq, Debug)]
#[allow(non_camel_case_types)]
enum path_state {
    START, /* first byte of a path component */
    A,     /* "AUX" */
    AU,    /* "au" seen at the start of a component */
    THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
    C,     /* "CON" or "COMn" */
    CO,    /* "co" seen at the start of a component */
    COMLPT, /* "COM" or "LPT" */
    COMLPTn, /* "com" or "lpt" followed by a digit 1-9 */
    L,     /* "l" seen at the start of a component */
    LP,    /* "lp" seen at the start of a component */
    N,     /* "n" seen at the start of a component */
    NU,    /* "nu" seen at the start of a component */
    P,     /* "PRN" */
    PR,    /* "pr" seen at the start of a component */
    LDOT,  /* leading '.' */
    DOT,   /* '.' in a non-leading position */
    H,     /* ".h" */
    HGDI,  /* ".hg", ".d", or ".i" */
    SPACE, /* ' ' in a non-leading position, not yet written out */
    DEFAULT, /* byte of a path component after the first */
}
28
/* state machine for dir-encoding
 *
 * Used by `encode_dir`, which appends ".hg" to a path component that
 * ends in ".hg", ".d" or ".i" and is followed by '/'. */
#[allow(non_camel_case_types)]
enum dir_state {
    DDOT,     /* previous byte was '.' */
    DH,       /* previous bytes were ".h" */
    DHGDI,    /* previous bytes were ".hg", ".d" or ".i" */
    DDEFAULT, /* none of the above */
}
37
/// Reports whether byte `c` belongs to `bitset`.
///
/// The set is a 256-bit map held in eight 32-bit words: the upper three
/// bits of `c` pick the word, the lower five bits pick the bit inside it.
fn inset(bitset: &[u32; 8], c: u8) -> bool {
    let word = bitset[usize::from(c >> 5)];
    let mask = 1u32 << (c & 31);
    word & mask != 0
}
41
/// Emits the single byte `c` at offset `*destlen` and advances `*destlen`
/// by one.
///
/// When `dest` is `None` nothing is stored; only the length is counted.
///
/// # Panics
///
/// Panics if `dest` is `Some` and `*destlen` is outside the buffer.
fn charcopy(dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) {
    let pos = *destlen;
    match dest {
        Some(buf) => buf[pos] = c,
        None => {}
    }
    *destlen = pos + 1;
}
48
/// Emits all of `src` starting at offset `*destlen` and advances `*destlen`
/// by `src.len()`.
///
/// When `dest` is `None` nothing is stored; only the length is counted.
///
/// # Panics
///
/// Panics if `dest` is `Some` and the bytes do not fit in the buffer.
fn memcopy(dest: Option<&mut [u8]>, destlen: &mut usize, src: &[u8]) {
    let start = *destlen;
    let end = start + src.len();
    if let Some(buf) = dest {
        buf[start..end].copy_from_slice(src);
    }
    *destlen = end;
}
55
/// Reborrows the slice inside `x` for a shorter lifetime.
///
/// This hands a temporary `Option<&mut [u8]>` to a callee while the caller
/// keeps ownership of the original option and can reborrow it again later.
fn rewrap_option<'a, 'b: 'a>(
    x: &'a mut Option<&'b mut [u8]>,
) -> Option<&'a mut [u8]> {
    x.as_deref_mut()
}
64
65 fn hexencode<'a>(mut dest: Option<&'a mut [u8]>, destlen: &mut usize, c: u8) {
66 let hexdigit = b"0123456789abcdef";
67 charcopy(
68 rewrap_option(&mut dest),
69 destlen,
70 hexdigit[(c as usize) >> 4],
71 );
72 charcopy(dest, destlen, hexdigit[(c as usize) & 15]);
73 }
74
75 /* 3-byte escape: tilde followed by two hex digits */
76 fn escape3(mut dest: Option<&mut [u8]>, destlen: &mut usize, c: u8) {
77 charcopy(rewrap_option(&mut dest), destlen, b'~');
78 hexencode(dest, destlen, c);
79 }
80
81 fn encode_dir(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize {
82 let mut state = dir_state::DDEFAULT;
83 let mut i = 0;
84 let mut destlen = 0;
85
86 while i < src.len() {
87 match state {
88 dir_state::DDOT => match src[i] {
89 b'd' | b'i' => {
90 state = dir_state::DHGDI;
91 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
92 i += 1;
93 }
94 b'h' => {
95 state = dir_state::DH;
96 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
97 i += 1;
98 }
99 _ => {
100 state = dir_state::DDEFAULT;
101 }
102 },
103 dir_state::DH => {
104 if src[i] == b'g' {
105 state = dir_state::DHGDI;
106 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
107 i += 1;
108 } else {
109 state = dir_state::DDEFAULT;
110 }
111 }
112 dir_state::DHGDI => {
113 if src[i] == b'/' {
114 memcopy(rewrap_option(&mut dest), &mut destlen, b".hg");
115 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
116 i += 1;
117 }
118 state = dir_state::DDEFAULT;
119 }
120 dir_state::DDEFAULT => {
121 if src[i] == b'.' {
122 state = dir_state::DDOT
123 }
124 charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
125 i += 1;
126 }
127 }
128 }
129 destlen
130 }
131
/// Core path encoder shared by `basic_encode` and `aux_encode`.
///
/// Walks `src` one byte at a time through the `path_state` machine and
/// writes the encoded form:
/// - bytes in the `onebyte` set are copied unchanged;
/// - bytes in the `twobytes` set are written as `_` followed by the byte
///   plus 32 (`_` itself is written as `__`);
/// - any other ordinary byte is written as a `~xx` hex escape;
/// - a component starting with `.` or space gets that byte escaped;
/// - a component ending with `.` or space gets that byte escaped;
/// - components starting with `aux`, `con`, `nul`, `prn`, or `com`/`lpt`
///   plus a digit 1-9, and then followed by `.`, `/` or the end of input,
///   get their third letter escaped;
/// - when `encodedir` is true, `.hg` is appended to a component that ends
///   in `.hg`, `.d` or `.i` and is followed by `/`.
///
/// Returns the number of bytes produced. With `dest == None` nothing is
/// stored and only that length is computed.
///
/// # Panics
///
/// Panics if `dest` is `Some` and too small for the output.
fn _encode(
    twobytes: &[u32; 8],
    onebyte: &[u32; 8],
    mut dest: Option<&mut [u8]>,
    src: &[u8],
    encodedir: bool,
) -> usize {
    let mut state = path_state::START;
    let mut i = 0;
    let mut destlen = 0;
    let len = src.len();

    // Each round handles src[i] in the current state. A branch that does
    // not advance `i` hands the same byte over to the new state.
    //
    // NOTE(review): several arms treat b'\0' like a component terminator;
    // this looks like a carry-over from a NUL-terminated implementation —
    // confirm.
    while i < len {
        match state {
            path_state::START => match src[i] {
                b'/' => {
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'.' => {
                    // leading dot is always escaped
                    state = path_state::LDOT;
                    escape3(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b' ' => {
                    // leading space is always escaped
                    state = path_state::DEFAULT;
                    escape3(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'a' => {
                    state = path_state::A;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'c' => {
                    state = path_state::C;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'l' => {
                    state = path_state::L;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'n' => {
                    state = path_state::N;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'p' => {
                    state = path_state::P;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                }
            },
            path_state::A => {
                if src[i] == b'u' {
                    state = path_state::AU;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::AU => {
                if src[i] == b'x' {
                    // The third letter is consumed but NOT written yet:
                    // THIRD decides whether it must be escaped.
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::THIRD => {
                state = path_state::DEFAULT;
                match src[i] {
                    // The three-letter name is the whole stem: escape its
                    // held-back third letter.
                    b'.' | b'/' | b'\0' => escape3(
                        rewrap_option(&mut dest),
                        &mut destlen,
                        src[i - 1],
                    ),
                    // Otherwise step back so DEFAULT handles the held-back
                    // letter like any other byte.
                    _ => i -= 1,
                }
            }
            path_state::C => {
                if src[i] == b'o' {
                    state = path_state::CO;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::CO => {
                // In both branches the third letter is held back, as in AU.
                if src[i] == b'm' {
                    state = path_state::COMLPT;
                    i += 1;
                } else if src[i] == b'n' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::COMLPT => {
                if src[i] >= b'1' && src[i] <= b'9' {
                    // The digit is held back too; COMLPTn now has two
                    // unwritten bytes (src[i - 2] and src[i - 1]).
                    state = path_state::COMLPTn;
                    i += 1;
                } else {
                    // No digit follows: write the held-back third letter.
                    state = path_state::DEFAULT;
                    charcopy(
                        rewrap_option(&mut dest),
                        &mut destlen,
                        src[i - 1],
                    );
                }
            }
            path_state::COMLPTn => {
                state = path_state::DEFAULT;
                match src[i] {
                    b'.' | b'/' | b'\0' => {
                        // Escape the third letter, keep the digit as is.
                        escape3(
                            rewrap_option(&mut dest),
                            &mut destlen,
                            src[i - 2],
                        );
                        charcopy(
                            rewrap_option(&mut dest),
                            &mut destlen,
                            src[i - 1],
                        );
                    }
                    _ => {
                        // Write both held-back bytes unchanged.
                        memcopy(
                            rewrap_option(&mut dest),
                            &mut destlen,
                            &src[i - 2..i],
                        );
                    }
                }
            }
            path_state::L => {
                if src[i] == b'p' {
                    state = path_state::LP;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::LP => {
                if src[i] == b't' {
                    state = path_state::COMLPT;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::N => {
                if src[i] == b'u' {
                    state = path_state::NU;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::NU => {
                if src[i] == b'l' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::P => {
                if src[i] == b'r' {
                    state = path_state::PR;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::PR => {
                if src[i] == b'n' {
                    state = path_state::THIRD;
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::LDOT => match src[i] {
                // The leading dot itself was already written (escaped) by
                // START.
                b'd' | b'i' => {
                    state = path_state::HGDI;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'h' => {
                    state = path_state::H;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                }
            },
            path_state::DOT => match src[i] {
                // DEFAULT consumed the dot without writing it; it is
                // written here, escaped only if it ends the component.
                b'/' | b'\0' => {
                    state = path_state::START;
                    memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e");
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'd' | b'i' => {
                    state = path_state::HGDI;
                    charcopy(rewrap_option(&mut dest), &mut destlen, b'.');
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                b'h' => {
                    state = path_state::H;
                    memcopy(rewrap_option(&mut dest), &mut destlen, b".h");
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                    charcopy(rewrap_option(&mut dest), &mut destlen, b'.');
                }
            },
            path_state::H => {
                if src[i] == b'g' {
                    state = path_state::HGDI;
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::HGDI => {
                if src[i] == b'/' {
                    state = path_state::START;
                    if encodedir {
                        memcopy(
                            rewrap_option(&mut dest),
                            &mut destlen,
                            b".hg",
                        );
                    }
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1
                } else {
                    state = path_state::DEFAULT;
                }
            }
            path_state::SPACE => match src[i] {
                // DEFAULT consumed the space without writing it; it is
                // written here, escaped only if it ends the component.
                b'/' | b'\0' => {
                    state = path_state::START;
                    memcopy(rewrap_option(&mut dest), &mut destlen, b"~20");
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                _ => {
                    state = path_state::DEFAULT;
                    charcopy(rewrap_option(&mut dest), &mut destlen, b' ');
                }
            },
            path_state::DEFAULT => {
                // Fast path: copy the run of bytes that need no encoding.
                while i != len && inset(onebyte, src[i]) {
                    charcopy(rewrap_option(&mut dest), &mut destlen, src[i]);
                    i += 1;
                }
                if i == len {
                    break;
                }
                match src[i] {
                    b'.' => {
                        state = path_state::DOT;
                        i += 1
                    }
                    b' ' => {
                        state = path_state::SPACE;
                        i += 1
                    }
                    b'/' => {
                        state = path_state::START;
                        charcopy(rewrap_option(&mut dest), &mut destlen, b'/');
                        i += 1;
                    }
                    _ => {
                        // NOTE(review): the scan at the top of this state
                        // already consumed every `onebyte` byte, so this
                        // first branch looks unreachable — confirm before
                        // removing.
                        if inset(onebyte, src[i]) {
                            loop {
                                charcopy(
                                    rewrap_option(&mut dest),
                                    &mut destlen,
                                    src[i],
                                );
                                i += 1;
                                if !(i < len && inset(onebyte, src[i])) {
                                    break;
                                }
                            }
                        } else if inset(twobytes, src[i]) {
                            let c = src[i];
                            i += 1;
                            charcopy(
                                rewrap_option(&mut dest),
                                &mut destlen,
                                b'_',
                            );
                            charcopy(
                                rewrap_option(&mut dest),
                                &mut destlen,
                                if c == b'_' { b'_' } else { c + 32 },
                            );
                        } else {
                            escape3(
                                rewrap_option(&mut dest),
                                &mut destlen,
                                src[i],
                            );
                            i += 1;
                        }
                    }
                }
            }
        }
    }
    // The input ended. States that were holding bytes back flush them the
    // same way their in-loop arm does when a terminator follows.
    match state {
        path_state::START => (),
        path_state::A => (),
        path_state::AU => (),
        path_state::THIRD => {
            escape3(rewrap_option(&mut dest), &mut destlen, src[i - 1])
        }
        path_state::C => (),
        path_state::CO => (),
        path_state::COMLPT => {
            charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1])
        }
        path_state::COMLPTn => {
            escape3(rewrap_option(&mut dest), &mut destlen, src[i - 2]);
            charcopy(rewrap_option(&mut dest), &mut destlen, src[i - 1]);
        }
        path_state::L => (),
        path_state::LP => (),
        path_state::N => (),
        path_state::NU => (),
        path_state::P => (),
        path_state::PR => (),
        path_state::LDOT => (),
        path_state::DOT => {
            memcopy(rewrap_option(&mut dest), &mut destlen, b"~2e");
        }
        path_state::H => (),
        path_state::HGDI => (),
        path_state::SPACE => {
            memcopy(rewrap_option(&mut dest), &mut destlen, b"~20");
        }
        path_state::DEFAULT => (),
    };
    destlen
}
497
498 fn basic_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize {
499 let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0];
500 let onebyte: [u32; 8] =
501 [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0];
502 _encode(&twobytes, &onebyte, dest, src, true)
503 }
504
/// Longest encoded path, in bytes, that is stored as-is; `path_encode`
/// switches to the hashed form for anything longer.
const MAXSTOREPATHLEN: usize = 120;
506
507 fn lower_encode(mut dest: Option<&mut [u8]>, src: &[u8]) -> usize {
508 let onebyte: [u32; 8] =
509 [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0];
510 let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0];
511 let mut destlen = 0;
512 for c in src {
513 if inset(&onebyte, *c) {
514 charcopy(rewrap_option(&mut dest), &mut destlen, *c)
515 } else if inset(&lower, *c) {
516 charcopy(rewrap_option(&mut dest), &mut destlen, *c + 32)
517 } else {
518 escape3(rewrap_option(&mut dest), &mut destlen, *c)
519 }
520 }
521 destlen
522 }
523
524 fn aux_encode(dest: Option<&mut [u8]>, src: &[u8]) -> usize {
525 let twobytes = [0; 8];
526 let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0];
527 _encode(&twobytes, &onebyte, dest, src, false)
528 }
529
530 fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> {
531 let dirprefixlen = 8;
532 let maxshortdirslen = 68;
533 let mut destlen = 0;
534
535 let last_slash = src.iter().rposition(|b| *b == b'/');
536 let last_dot: Option<usize> = {
537 let s = last_slash.unwrap_or(0);
538 src[s..]
539 .iter()
540 .rposition(|b| *b == b'.')
541 .and_then(|i| Some(i + s))
542 };
543
544 let mut dest = vec![0; MAXSTOREPATHLEN];
545 memcopy(Some(&mut dest), &mut destlen, b"dh/");
546
547 {
548 let mut first = true;
549 for slice in src[..last_slash.unwrap_or_else(|| src.len())]
550 .split(|b| *b == b'/')
551 {
552 let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)];
553 if destlen + (slice.len() + if first { 0 } else { 1 })
554 > maxshortdirslen + 3
555 {
556 break;
557 } else {
558 if !first {
559 charcopy(Some(&mut dest), &mut destlen, b'/')
560 };
561 memcopy(Some(&mut dest), &mut destlen, slice);
562 if dest[destlen - 1] == b'.' || dest[destlen - 1] == b' ' {
563 dest[destlen - 1] = b'_'
564 }
565 }
566 first = false;
567 }
568 if !first {
569 charcopy(Some(&mut dest), &mut destlen, b'/');
570 }
571 }
572
573 let used = destlen + 40 + {
574 if let Some(l) = last_dot {
575 src.len() - l
576 } else {
577 0
578 }
579 };
580
581 if MAXSTOREPATHLEN > used {
582 let slop = MAXSTOREPATHLEN - used;
583 let basenamelen = match last_slash {
584 Some(l) => src.len() - l - 1,
585 None => src.len(),
586 };
587 let basenamelen = std::cmp::min(basenamelen, slop);
588 if basenamelen > 0 {
589 let start = match last_slash {
590 Some(l) => l + 1,
591 None => 0,
592 };
593 memcopy(
594 Some(&mut dest),
595 &mut destlen,
596 &src[start..][..basenamelen],
597 )
598 }
599 }
600 for c in sha {
601 hexencode(Some(&mut dest), &mut destlen, *c);
602 }
603 if let Some(l) = last_dot {
604 memcopy(Some(&mut dest), &mut destlen, &src[l..]);
605 }
606 if destlen == dest.len() {
607 dest
608 } else {
609 // sometimes the path are shorter than MAXSTOREPATHLEN
610 dest[..destlen].to_vec()
611 }
612 }
613
614 const MAXENCODE: usize = 4096 * 4;
615 fn hash_encode(src: &[u8]) -> Vec<u8> {
616 let dired = &mut [0; MAXENCODE];
617 let lowered = &mut [0; MAXENCODE];
618 let auxed = &mut [0; MAXENCODE];
619 let baselen = (src.len() - 5) * 3;
620 if baselen >= MAXENCODE {
621 panic!("path_encode::hash_encore: string too long: {}", baselen)
622 };
623 let dirlen = encode_dir(Some(&mut dired[..]), src);
624 let sha = {
625 let mut hasher = Sha1::new();
626 hasher.input(&dired[..dirlen]);
627 let mut hash = vec![0; 20];
628 hasher.result(&mut hash);
629 hash
630 };
631 let lowerlen = lower_encode(Some(&mut lowered[..]), &dired[..dirlen][5..]);
632 let auxlen = aux_encode(Some(&mut auxed[..]), &lowered[..lowerlen]);
633 hash_mangle(&auxed[..auxlen], &sha)
634 }
635
636 pub fn path_encode(path: &[u8]) -> Vec<u8> {
637 let newlen = if path.len() <= MAXSTOREPATHLEN {
638 basic_encode(None, path)
639 } else {
640 MAXSTOREPATHLEN + 1
641 };
642 if newlen <= MAXSTOREPATHLEN {
643 if newlen == path.len() {
644 path.to_vec()
645 } else {
646 let mut res = vec![0; newlen];
647 basic_encode(Some(&mut res), path);
648 res
649 }
650 } else {
651 hash_encode(&path)
652 }
653 }