changeset 33926:f4433f2713d0

encoding: add function to test if a str consists of ASCII characters. Most strings are ASCII. Let's optimize for it. Using uint64_t is slightly faster than uint32_t on a 64-bit system, but there isn't a huge difference.
author Yuya Nishihara <yuya@tcha.org>
date Sun, 23 Apr 2017 12:59:42 +0900
parents 2c37f9dabc32
children 853574db5b12
files contrib/python3-whitelist mercurial/cext/charencode.c mercurial/cext/charencode.h mercurial/cext/parsers.c mercurial/compat.h mercurial/encoding.py mercurial/policy.py mercurial/pure/charencode.py tests/test-encoding-func.py
diffstat 9 files changed, 72 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/contrib/python3-whitelist	Sun Apr 23 14:47:52 2017 +0900
+++ b/contrib/python3-whitelist	Sun Apr 23 12:59:42 2017 +0900
@@ -18,6 +18,7 @@
 test-duplicateoptions.py
 test-empty-dir.t
 test-empty.t
+test-encoding-func.py
 test-excessive-merge.t
 test-hghave.t
 test-imports-checker.t
--- a/mercurial/cext/charencode.c	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/charencode.c	Sun Apr 23 12:59:42 2017 +0900
@@ -12,6 +12,7 @@
 #include <assert.h>
 
 #include "charencode.h"
+#include "compat.h"
 #include "util.h"
 
 #ifdef IS_PY3K
@@ -125,6 +126,29 @@
 	return ret;
 }
 
+PyObject *isasciistr(PyObject *self, PyObject *args) /* returns True iff no byte of the str argument has its high bit (0x80) set */
+{
+	const char *buf;
+	Py_ssize_t i, len;
+	if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len))
+		return NULL; /* argument parsing failed; propagate the error to the caller */
+	i = 0;
+	/* char array in PyStringObject should be at least 4-byte aligned; if it is not, skip the word-wise fast path */
+	if (((uintptr_t)buf & 3) == 0) {
+		const uint32_t *p = (const uint32_t *)buf;
+		for (; i < len / 4; i++) { /* fast path: examine four bytes per iteration */
+			if (p[i] & 0x80808080U) /* high bit set in at least one of the four bytes */
+				Py_RETURN_FALSE;
+		}
+		i *= 4; /* turn the word count into a byte offset for the tail loop below */
+	}
+	for (; i < len; i++) { /* trailing len % 4 bytes, or every byte when buf was not aligned */
+		if (buf[i] & 0x80)
+			Py_RETURN_FALSE;
+	}
+	Py_RETURN_TRUE;
+}
+
 static inline PyObject *_asciitransform(PyObject *str_obj,
 					const char table[128],
 					PyObject *fallback_fn)
--- a/mercurial/cext/charencode.h	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/charencode.h	Sun Apr 23 12:59:42 2017 +0900
@@ -19,6 +19,7 @@
 };
 
 PyObject *unhexlify(const char *str, Py_ssize_t len);
+PyObject *isasciistr(PyObject *self, PyObject *args);
 PyObject *asciilower(PyObject *self, PyObject *args);
 PyObject *asciiupper(PyObject *self, PyObject *args);
 PyObject *make_file_foldmap(PyObject *self, PyObject *args);
--- a/mercurial/cext/parsers.c	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/parsers.c	Sun Apr 23 12:59:42 2017 +0900
@@ -696,6 +696,7 @@
 	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
 	{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
 	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+	{"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"},
 	{"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"},
 	{"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"},
 	{"dict_new_presized", dict_new_presized, METH_VARARGS,
@@ -716,7 +717,7 @@
 void manifest_module_init(PyObject *mod);
 void revlog_module_init(PyObject *mod);
 
-static const int version = 2;
+static const int version = 3;
 
 static void module_init(PyObject *mod)
 {
--- a/mercurial/compat.h	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/compat.h	Sun Apr 23 12:59:42 2017 +0900
@@ -7,8 +7,10 @@
 #define inline __inline
 #if defined(_WIN64)
 typedef __int64 ssize_t;
+typedef unsigned __int64 uintptr_t;
 #else
 typedef int ssize_t;
+typedef unsigned int uintptr_t;
 #endif
 typedef signed char int8_t;
 typedef short int16_t;
--- a/mercurial/encoding.py	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/encoding.py	Sun Apr 23 12:59:42 2017 +0900
@@ -24,6 +24,7 @@
 
 charencode = policy.importmod(r'charencode')
 
+isasciistr = charencode.isasciistr
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
 _jsonescapeu8fast = charencode.jsonescapeu8fast
--- a/mercurial/policy.py	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/policy.py	Sun Apr 23 12:59:42 2017 +0900
@@ -75,7 +75,7 @@
     (r'cext', r'diffhelpers'): 1,
     (r'cext', r'mpatch'): 1,
     (r'cext', r'osutil'): 1,
-    (r'cext', r'parsers'): 2,
+    (r'cext', r'parsers'): 3,
 }
 
 # map import request to other package or module
--- a/mercurial/pure/charencode.py	Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/pure/charencode.py	Sun Apr 23 12:59:42 2017 +0900
@@ -13,6 +13,13 @@
     pycompat,
 )
 
+def isasciistr(s):  # pure-Python version: True if s decodes as ASCII, else False
+    try:
+        s.decode('ascii')  # result is discarded; only success or failure matters
+        return True
+    except UnicodeDecodeError:  # s holds at least one non-ASCII byte
+        return False
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-encoding-func.py	Sun Apr 23 12:59:42 2017 +0900
@@ -0,0 +1,33 @@
+from __future__ import absolute_import
+
+import unittest
+
+from mercurial import (
+    encoding,
+)
+
+class IsasciistrTest(unittest.TestCase):  # exercises encoding.isasciistr
+    asciistrs = [  # pure-ASCII samples of lengths 1-5 and 9, one with an embedded NUL
+        b'a',
+        b'ab',
+        b'abc',
+        b'abcd',
+        b'abcde',
+        b'abcdefghi',
+        b'abcd\0fghi',
+    ]
+
+    def testascii(self):  # every sample must be reported as ASCII
+        for s in self.asciistrs:
+            self.assertTrue(encoding.isasciistr(s))
+
+    def testnonasciichar(self):  # a high bit set at any single position must be detected
+        for s in self.asciistrs:
+            for i in range(len(s)):
+                t = bytearray(s)  # fresh mutable copy so only position i is altered
+                t[i] |= 0x80  # make exactly one byte non-ASCII
+                self.assertFalse(encoding.isasciistr(bytes(t)))
+
+if __name__ == '__main__':  # only when executed directly as a script
+    import silenttestrunner  # imported lazily; presumably the project's test runner - confirm
+    silenttestrunner.main(__name__)