Move the continuation-byte bit checks into helper functions shared by the two places that perform them

(grafted from ee3a2fbec01afa109be9414e105ea7250a7e1b24)
author: Matti Picus <matti.picus@gmail.com> 2016-09-04 17:55:23 +0300
committer: Matti Picus <matti.picus@gmail.com> 2016-09-04 17:55:23 +0300
commit: 09a6f823eae50b5b9e50a15b5a55632adc4d31fe (patch)
tree: 63e6d74fe9281e8d06a8a7420b00f1023c1d974e
parent: Issue #2389: the custom error handler may return a 'pos' that is smaller (diff)
download: pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.gz
pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.bz2
pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.zip
1 file changed, 28 insertions, 20 deletions
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
index 17aac5e9f5..23f965cd53 100644
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -137,6 +137,25 @@ def str_decode_utf_8(s, size, errors, final=False,
                                  result=result)
     return result.build(), pos
 
+def _invalid_cont_byte(ordch):
+    return ordch>>6 != 0x2    # 0b10
+
+_invalid_byte_2_of_2 = _invalid_cont_byte
+_invalid_byte_3_of_3 = _invalid_cont_byte
+_invalid_byte_3_of_4 = _invalid_cont_byte
+_invalid_byte_4_of_4 = _invalid_cont_byte
+
+def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xe0 and ordch2 < 0xa0)
+            # surrogates shouldn't be valid UTF-8!
+            or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f))
+
+def _invalid_byte_2_of_4(ordch1, ordch2):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xf0 and ordch2 < 0x90) or
+            (ordch1 == 0xf4 and ordch2 > 0x8f))
+
 @specialize.argtype(6)
 def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                           allow_surrogates, result):
@@ -173,10 +192,7 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
             ordch2 = ord(s[pos+1])
             if n == 3:
                 # 3-bytes seq with only a continuation byte
-                if (ordch2>>6 != 0x2 or   # 0b10
-                    (ordch1 == 0xe0 and ordch2 < 0xa0)
-                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
-                    ):
+                if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -192,16 +208,14 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                     continue
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
-                if (ordch2>>6 != 0x2 or    # 0b10
-                    (ordch1 == 0xf0 and ordch2 < 0x90) or
-                    (ordch1 == 0xf4 and ordch2 > 0x8f)):
+                if _invalid_byte_2_of_4(ordch1, ordch2):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
                                           s, pos, pos+1)
                     result.append(r)
                     continue
-                elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2:   # 0b10
+                elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])):
                     # third byte invalid, take the first two and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -228,7 +242,7 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
 
         elif n == 2:
             ordch2 = ord(s[pos+1])
-            if ordch2>>6 != 0x2:   # 0b10
+            if _invalid_byte_2_of_2(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
@@ -242,17 +256,13 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
         elif n == 3:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
-            if (ordch2>>6 != 0x2 or    # 0b10
-                (ordch1 == 0xe0 and ordch2 < 0xa0)
-                # surrogates shouldn't be valid UTF-8!
-                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
-                ):
+            if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif ordch3>>6 != 0x2:     # 0b10
+            elif _invalid_byte_3_of_3(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
@@ -268,21 +278,19 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
             ordch4 = ord(s[pos+3])
-            if (ordch2>>6 != 0x2 or     # 0b10
-                (ordch1 == 0xf0 and ordch2 < 0x90) or
-                (ordch1 == 0xf4 and ordch2 > 0x8f)):
+            if _invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif ordch3>>6 != 0x2:     # 0b10
+            elif _invalid_byte_3_of_4(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
-            elif ordch4>>6 != 0x2:     # 0b10
+            elif _invalid_byte_4_of_4(ordch4):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+3)
author	Matti Picus <matti.picus@gmail.com>	2016-09-04 17:55:23 +0300
committer	Matti Picus <matti.picus@gmail.com>	2016-09-04 17:55:23 +0300
commit	09a6f823eae50b5b9e50a15b5a55632adc4d31fe (patch)
tree	63e6d74fe9281e8d06a8a7420b00f1023c1d974e
parent	Issue #2389: the custom error handler may return a 'pos' that is smaller (diff)
download	pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.gz pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.bz2 pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.zip