Mercurial > hg
changeset 15726:9b822edecb4c
i18n: use "encoding.lower()" to normalize specified string for revset
Some problematic encodings (e.g. cp932) use the byte values of ASCII
alphabet characters within the byte sequences of multi-byte characters.
Calling "str.lower()" on such a byte sequence may treat distinct characters
as the same one, and cause unexpected log matches.
This patch uses "encoding.lower()" instead of "str.lower()" to
normalize strings for comparison.
author | FUJIWARA Katsunori <foozy@lares.dti.ne.jp> |
---|---|
date | Sun, 25 Dec 2011 20:35:16 +0900 |
parents | 988409e44a76 |
children | 917f263eeb26 |
files | mercurial/revset.py tests/test-revset.t |
diffstat | 2 files changed, 65 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/revset.py Sun Dec 25 20:35:16 2011 +0900 +++ b/mercurial/revset.py Sun Dec 25 20:35:16 2011 +0900 @@ -11,6 +11,7 @@ import bookmarks as bookmarksmod import match as matchmod from i18n import _ +import encoding elements = { "(": (20, ("group", 1, ")"), ("func", 1, ")")), @@ -233,8 +234,8 @@ Alias for ``user(string)``. """ # i18n: "author" is a keyword - n = getstring(x, _("author requires a string")).lower() - return [r for r in subset if n in repo[r].user().lower()] + n = encoding.lower(getstring(x, _("author requires a string"))) + return [r for r in subset if n in encoding.lower(repo[r].user())] def bisect(repo, subset, x): """``bisect(string)`` @@ -376,11 +377,11 @@ Search commit message for string. The match is case-insensitive. """ # i18n: "desc" is a keyword - ds = getstring(x, _("desc requires a string")).lower() + ds = encoding.lower(getstring(x, _("desc requires a string"))) l = [] for r in subset: c = repo[r] - if ds in c.description().lower(): + if ds in encoding.lower(c.description()): l.append(r) return l @@ -522,12 +523,12 @@ string. The match is case-insensitive. """ # i18n: "keyword" is a keyword - kw = getstring(x, _("keyword requires a string")).lower() + kw = encoding.lower(getstring(x, _("keyword requires a string"))) l = [] for r in subset: c = repo[r] t = " ".join(c.files() + [c.user(), c.description()]) - if kw in t.lower(): + if kw in encoding.lower(t): l.append(r) return l
--- a/tests/test-revset.t Sun Dec 25 20:35:16 2011 +0900 +++ b/tests/test-revset.t Sun Dec 25 20:35:16 2011 +0900 @@ -475,3 +475,61 @@ $ log 'max(1 or 2) and not 2' $ log 'min(1 or 2) and not 1' $ log 'last(1 or 2, 1) and not 2' + + $ cd .. + +test author/desc/keyword in problematic encoding +# unicode: cp932: +# u30A2 0x83 0x41(= 'A') +# u30C2 0x83 0x61(= 'a') + + $ hg init problematicencoding + $ cd problematicencoding + + $ python > setup.sh <<EOF + > print u''' + > echo a > text + > hg add text + > hg --encoding utf-8 commit -u '\u30A2' -m none + > echo b > text + > hg --encoding utf-8 commit -u '\u30C2' -m none + > echo c > text + > hg --encoding utf-8 commit -u none -m '\u30A2' + > echo d > text + > hg --encoding utf-8 commit -u none -m '\u30C2' + > '''.encode('utf-8') + > EOF + $ sh < setup.sh + +test in problematic encoding + $ python > test.sh <<EOF + > print u''' + > hg --encoding cp932 log --template '{rev}\\n' -r 'author(\u30A2)' + > echo ==== + > hg --encoding cp932 log --template '{rev}\\n' -r 'author(\u30C2)' + > echo ==== + > hg --encoding cp932 log --template '{rev}\\n' -r 'desc(\u30A2)' + > echo ==== + > hg --encoding cp932 log --template '{rev}\\n' -r 'desc(\u30C2)' + > echo ==== + > hg --encoding cp932 log --template '{rev}\\n' -r 'keyword(\u30A2)' + > echo ==== + > hg --encoding cp932 log --template '{rev}\\n' -r 'keyword(\u30C2)' + > '''.encode('cp932') + > EOF + $ sh < test.sh + 0 + ==== + 1 + ==== + 2 + ==== + 3 + ==== + 0 + 2 + ==== + 1 + 3 + + $ cd ..