diff options
author | 2016-09-04 17:55:23 +0300 | |
---|---|---|
committer | 2016-09-04 17:55:23 +0300 | |
commit | 09a6f823eae50b5b9e50a15b5a55632adc4d31fe (patch) | |
tree | 63e6d74fe9281e8d06a8a7420b00f1023c1d974e | |
parent | Issue #2389: the custom error handler may return a 'pos' that is smaller (diff) | |
download | pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.gz pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.tar.bz2 pypy-09a6f823eae50b5b9e50a15b5a55632adc4d31fe.zip |
Move the bit checking inside helpers, share it from the two places
(grafted from ee3a2fbec01afa109be9414e105ea7250a7e1b24)
-rw-r--r-- | rpython/rlib/runicode.py | 48 |
1 files changed, 28 insertions, 20 deletions
```diff
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
index 17aac5e9f5..23f965cd53 100644
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -137,6 +137,25 @@ def str_decode_utf_8(s, size, errors, final=False,
                                 result=result)
     return result.build(), pos
 
+def _invalid_cont_byte(ordch):
+    return ordch>>6 != 0x2    # 0b10
+
+_invalid_byte_2_of_2 = _invalid_cont_byte
+_invalid_byte_3_of_3 = _invalid_cont_byte
+_invalid_byte_3_of_4 = _invalid_cont_byte
+_invalid_byte_4_of_4 = _invalid_cont_byte
+
+def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xe0 and ordch2 < 0xa0)
+            # surrogates shouldn't be valid UTF-8!
+            or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f))
+
+def _invalid_byte_2_of_4(ordch1, ordch2):
+    return (ordch2>>6 != 0x2 or    # 0b10
+            (ordch1 == 0xf0 and ordch2 < 0x90) or
+            (ordch1 == 0xf4 and ordch2 > 0x8f))
+
 @specialize.argtype(6)
 def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                           allow_surrogates, result):
@@ -173,10 +192,7 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
             ordch2 = ord(s[pos+1])
             if n == 3:
                 # 3-bytes seq with only a continuation byte
-                if (ordch2>>6 != 0x2 or   # 0b10
-                    (ordch1 == 0xe0 and ordch2 < 0xa0)
-                    or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
-                    ):
+                if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -192,16 +208,14 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                     continue
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
-                if (ordch2>>6 != 0x2 or     # 0b10
-                    (ordch1 == 0xf0 and ordch2 < 0x90) or
-                    (ordch1 == 0xf4 and ordch2 > 0x8f)):
+                if _invalid_byte_2_of_4(ordch1, ordch2):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
                                           s, pos, pos+1)
                     result.append(r)
                     continue
-                elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2:   # 0b10
+                elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])):
                     # third byte invalid, take the first two and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -228,7 +242,7 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
 
         elif n == 2:
             ordch2 = ord(s[pos+1])
-            if ordch2>>6 != 0x2:   # 0b10
+            if _invalid_byte_2_of_2(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
@@ -242,17 +256,13 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
         elif n == 3:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
-            if (ordch2>>6 != 0x2 or    # 0b10
-                (ordch1 == 0xe0 and ordch2 < 0xa0)
-                # surrogates shouldn't be valid UTF-8!
-                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
-                ):
+            if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif ordch3>>6 != 0x2:     # 0b10
+            elif _invalid_byte_3_of_3(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
@@ -268,21 +278,19 @@ def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
             ordch4 = ord(s[pos+3])
-            if (ordch2>>6 != 0x2 or     # 0b10
-                (ordch1 == 0xf0 and ordch2 < 0x90) or
-                (ordch1 == 0xf4 and ordch2 > 0x8f)):
+            if _invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif ordch3>>6 != 0x2:     # 0b10
+            elif _invalid_byte_3_of_4(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
-            elif ordch4>>6 != 0x2:     # 0b10
+            elif _invalid_byte_4_of_4(ordch4):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+3)
```