cython-devel

changeset 3254:06ea96f9a568

native support for Py_UNICODE, coercion between 1-character unicode literals and Py_UNICODE, fix C iteration over unicode strings by using Py_UNICODE*
author Stefan Behnel <scoder@users.berlios.de>
date Mon Apr 19 09:50:19 2010 +0200 (3 years ago)
parents ac2b6c7aa8ab
children 884b8a136a09
files Cython/Compiler/ExprNodes.py Cython/Compiler/Optimize.py Cython/Compiler/Parsing.py Cython/Compiler/PyrexTypes.py Cython/Shadow.py tests/errors/e_strcoerce.pyx tests/errors/py_unicode_type_errors.pyx tests/errors/string_assignments.pyx tests/run/for_in_string.pyx tests/run/py_unicode_type.pyx
line diff
1.1 --- a/Cython/Compiler/ExprNodes.py Sun Apr 18 23:21:10 2010 +0200 1.2 +++ b/Cython/Compiler/ExprNodes.py Mon Apr 19 09:50:19 2010 +0200 1.3 @@ -860,7 +860,10 @@ 1.4 def coerce_to(self, dst_type, env): 1.5 if dst_type.is_int: 1.6 if not self.can_coerce_to_char_literal(): 1.7 - error(self.pos, "Only single-character strings can be coerced into ints.") 1.8 + error(self.pos, "Only single-character string literals can be coerced into ints.") 1.9 + return self 1.10 + if dst_type is PyrexTypes.c_py_unicode_type: 1.11 + error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.") 1.12 return self 1.13 return CharNode(self.pos, value=self.value) 1.14 1.15 @@ -915,13 +918,22 @@ 1.16 def coerce_to(self, dst_type, env): 1.17 if dst_type is self.type: 1.18 pass 1.19 + elif dst_type is PyrexTypes.c_py_unicode_type: 1.20 + if not self.can_coerce_to_char_literal(): 1.21 + error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.") 1.22 + return self 1.23 + int_value = ord(self.value) 1.24 + return IntNode(self.pos, value=int_value, constant_result=int_value) 1.25 elif not dst_type.is_pyobject: 1.26 - error(self.pos, "Unicode objects do not support coercion to C types.") 1.27 + error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.") 1.28 elif dst_type is not py_object_type: 1.29 if not self.check_for_coercion_error(dst_type): 1.30 self.fail_assignment(dst_type) 1.31 return self 1.32 1.33 + def can_coerce_to_char_literal(self): 1.34 + return len(self.value) == 1 1.35 + 1.36 def generate_evaluation_code(self, code): 1.37 self.result_code = code.get_py_string_const(self.value) 1.38 1.39 @@ -5426,10 +5438,10 @@ 1.40 type1_can_be_int = False 1.41 type2_can_be_int = False 1.42 1.43 - if isinstance(operand1, (StringNode, BytesNode)) \ 1.44 + if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \ 1.45 and operand1.can_coerce_to_char_literal(): 1.46 type1_can_be_int = True 1.47 - if isinstance(operand2, (StringNode, BytesNode)) \ 1.48 + if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \ 1.49 and operand2.can_coerce_to_char_literal(): 1.50 type2_can_be_int = True 1.51
2.1 --- a/Cython/Compiler/Optimize.py Sun Apr 18 23:21:10 2010 +0200 2.2 +++ b/Cython/Compiler/Optimize.py Mon Apr 19 09:50:19 2010 +0200 2.3 @@ -137,7 +137,7 @@ 2.4 return node 2.5 2.6 PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType( 2.7 - PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE* 2.8 + PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [ 2.9 PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None) 2.10 ]) 2.11
3.1 --- a/Cython/Compiler/Parsing.py Sun Apr 18 23:21:10 2010 +0200 3.2 +++ b/Cython/Compiler/Parsing.py Mon Apr 19 09:50:19 2010 +0200 3.3 @@ -1851,6 +1851,7 @@ 3.4 3.5 special_basic_c_types = { 3.6 # name : (signed, longness) 3.7 + "Py_UNICODE" : (0, 0), 3.8 "Py_ssize_t" : (2, 0), 3.9 "size_t" : (0, 0), 3.10 }
4.1 --- a/Cython/Compiler/PyrexTypes.py Sun Apr 18 23:21:10 2010 +0200 4.2 +++ b/Cython/Compiler/PyrexTypes.py Mon Apr 19 09:50:19 2010 +0200 4.3 @@ -863,6 +863,20 @@ 4.4 return 'int' 4.5 4.6 4.7 +class CPyUnicodeIntType(CIntType): 4.8 + # Py_UNICODE 4.9 + 4.10 + # Conversion from a unicode string to Py_UNICODE at runtime is not 4.11 + # currently supported and may never be - we only convert from and 4.12 + # to integers here. The maximum value for a Py_UNICODE is 4.13 + # 1114111, so PyInt_FromLong() will do just fine here. 4.14 + 4.15 + to_py_function = "PyInt_FromLong" 4.16 + 4.17 + def sign_and_name(self): 4.18 + return "Py_UNICODE" 4.19 + 4.20 + 4.21 class CPySSizeTType(CIntType): 4.22 4.23 to_py_function = "PyInt_FromSsize_t" 4.24 @@ -2075,14 +2089,15 @@ 4.25 rank_to_type_name = ( 4.26 "char", # 0 4.27 "short", # 1 4.28 - "int", # 2 4.29 - "long", # 3 4.30 - "Py_ssize_t", # 4 4.31 - "size_t", # 5 4.32 - "PY_LONG_LONG", # 6 4.33 - "float", # 7 4.34 - "double", # 8 4.35 - "long double", # 9 4.36 + "Py_UNICODE", # 2 4.37 + "int", # 3 4.38 + "long", # 4 4.39 + "Py_ssize_t", # 5 4.40 + "size_t", # 6 4.41 + "PY_LONG_LONG", # 7 4.42 + "float", # 8 4.43 + "double", # 9 4.44 + "long double", # 10 4.45 ) 4.46 4.47 py_object_type = PyObjectType() 4.48 @@ -2093,29 +2108,30 @@ 4.49 4.50 c_uchar_type = CIntType(0, 0) 4.51 c_ushort_type = CIntType(1, 0) 4.52 -c_uint_type = CIntType(2, 0) 4.53 -c_ulong_type = CIntType(3, 0) 4.54 -c_ulonglong_type = CIntType(6, 0) 4.55 +c_py_unicode_type = CPyUnicodeIntType(2, 0) 4.56 +c_uint_type = CIntType(3, 0) 4.57 +c_ulong_type = CIntType(4, 0) 4.58 +c_ulonglong_type = CIntType(7, 0) 4.59 4.60 c_char_type = CIntType(0, 1) 4.61 c_short_type = CIntType(1, 1) 4.62 -c_int_type = CIntType(2, 1) 4.63 -c_long_type = CIntType(3, 1) 4.64 -c_longlong_type = CIntType(6, 1) 4.65 +c_int_type = CIntType(3, 1) 4.66 +c_long_type = CIntType(4, 1) 4.67 +c_longlong_type = CIntType(7, 1) 4.68 4.69 c_schar_type = CIntType(0, 2) 4.70 c_sshort_type = CIntType(1, 2) 4.71 -c_sint_type = CIntType(2, 2) 4.72 -c_slong_type = CIntType(3, 2) 4.73 -c_slonglong_type = CIntType(6, 2) 4.74 +c_sint_type = CIntType(3, 2) 4.75 +c_slong_type = CIntType(4, 2) 4.76 +c_slonglong_type = CIntType(7, 2) 4.77 4.78 -c_bint_type = CBIntType(2, 1) 4.79 -c_py_ssize_t_type = CPySSizeTType(4, 2) 4.80 -c_size_t_type = CSizeTType(5, 0) 4.81 +c_bint_type = CBIntType(3, 1) 4.82 +c_py_ssize_t_type = CPySSizeTType(5, 2) 4.83 +c_size_t_type = CSizeTType(6, 0) 4.84 4.85 -c_float_type = CFloatType(7, math_h_modifier='f') 4.86 -c_double_type = CFloatType(8) 4.87 -c_longdouble_type = CFloatType(9, math_h_modifier='l') 4.88 +c_float_type = CFloatType(8, math_h_modifier='f') 4.89 +c_double_type = CFloatType(9) 4.90 +c_longdouble_type = CFloatType(10, math_h_modifier='l') 4.91 4.92 c_float_complex_type = CComplexType(c_float_type) 4.93 c_double_complex_type = CComplexType(c_double_type) 4.94 @@ -2131,7 +2147,7 @@ 4.95 c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type) 4.96 c_size_t_ptr_type = CPtrType(c_size_t_type) 4.97 4.98 -c_returncode_type = CIntType(2, 1, is_returncode = 1) 4.99 +c_returncode_type = CIntType(3, 1, is_returncode = 1) 4.100 c_anon_enum_type = CAnonEnumType(-1, 1) 4.101 4.102 # the Py_buffer type is defined in Builtin.py 4.103 @@ -2165,6 +2181,7 @@ 4.104 (1, 0, "bint"): c_bint_type, 4.105 (0, 0, "size_t") : c_size_t_type, 4.106 (2, 0, "Py_ssize_t"): c_py_ssize_t_type, 4.107 + (0, 0, "Py_UNICODE"): c_py_unicode_type, 4.108 4.109 (1, 0, "float"): c_float_type, 4.110 (1, 0, "double"): c_double_type, 4.111 @@ -2383,6 +2400,8 @@ 4.112 signed = 2 4.113 elif name == 'size_t': 4.114 signed = 0 4.115 + elif name == 'Py_UNICODE': 4.116 + signed = 0 4.117 else: 4.118 if name.startswith('u'): 4.119 name = name[1:]
5.1 --- a/Cython/Shadow.py Sun Apr 18 23:21:10 2010 +0200 5.2 +++ b/Cython/Shadow.py Mon Apr 19 09:50:19 2010 +0200 5.3 @@ -174,7 +174,7 @@ 5.4 5.5 # Predefined types 5.6 5.7 -int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t'] 5.8 +int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t'] 5.9 float_types = ['longdouble', 'double', 'float'] 5.10 complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex'] 5.11 other_types = ['bint', 'void'] 5.12 @@ -183,7 +183,7 @@ 5.13 5.14 for name in int_types: 5.15 gs[name] = typedef(py_int) 5.16 - if not name.endswith('size_t'): 5.17 + if name != 'Py_UNICODE' and not name.endswith('size_t'): 5.18 gs['u'+name] = typedef(py_int) 5.19 gs['s'+name] = typedef(py_int) 5.20
6.1 --- a/tests/errors/e_strcoerce.pyx Sun Apr 18 23:21:10 2010 +0200 6.2 +++ b/tests/errors/e_strcoerce.pyx Mon Apr 19 09:50:19 2010 +0200 6.3 @@ -4,12 +4,14 @@ 6.4 6.5 cdef int x1 = "\xFF" # works 6.6 cdef int x2 = "\u0FFF" # fails 6.7 -cdef int x3 = u"\xFF" # fails 6.8 6.9 +cdef Py_UNICODE u1 = u"\xFF" # works 6.10 +cdef int u3 = u"\xFF" # fails 6.11 6.12 -_ERRORS = u""" 6.13 -2:14: Only single-character strings can be coerced into ints. 6.14 -3:14: Only single-character strings can be coerced into ints. 6.15 -6:15: Only single-character strings can be coerced into ints. 6.16 -7:14: Unicode objects do not support coercion to C types. 6.17 + 6.18 +_ERRORS = """ 6.19 +2:14: Only single-character string literals can be coerced into ints. 6.20 +3:14: Only single-character string literals can be coerced into ints. 6.21 +6:15: Only single-character string literals can be coerced into ints. 6.22 +9:14: Unicode literals do not support coercion to C types other than Py_UNICODE. 6.23 """
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/tests/errors/py_unicode_type_errors.pyx Mon Apr 19 09:50:19 2010 +0200 7.3 @@ -0,0 +1,24 @@ 7.4 +# -*- coding: iso-8859-1 -*- 7.5 + 7.6 +cdef Py_UNICODE char_ASCII = u'A' 7.7 +cdef Py_UNICODE char_KLINGON = u'\uF8D2' 7.8 + 7.9 +def char_too_long_ASCII(): 7.10 + cdef Py_UNICODE c = u'AB' 7.11 + 7.12 +def char_too_long_Unicode(): 7.13 + cdef Py_UNICODE c = u'A\uF8D2' 7.14 + 7.15 +def char_too_long_bytes(): 7.16 + cdef Py_UNICODE c = b'AB' 7.17 + 7.18 +def char_too_long_latin1(): 7.19 + cdef Py_UNICODE char_bytes_latin1 = b'ö' 7.20 + 7.21 + 7.22 +_ERRORS = """ 7.23 +7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. 7.24 +10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. 7.25 +13:24: Only single-character string literals can be coerced into ints. 7.26 +16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead. 7.27 +"""
8.1 --- a/tests/errors/string_assignments.pyx Sun Apr 18 23:21:10 2010 +0200 8.2 +++ b/tests/errors/string_assignments.pyx Mon Apr 19 09:50:19 2010 +0200 8.3 @@ -50,7 +50,7 @@ 8.4 cdef list l_f3 = u1 8.5 8.6 _ERRORS = u""" 8.7 -25:20: Unicode objects do not support coercion to C types. 8.8 +25:20: Unicode literals do not support coercion to C types other than Py_UNICODE. 8.9 26:22: Unicode objects do not support coercion to C types. 8.10 27:22: 'str' objects do not support coercion to C types (use 'bytes'?). 8.11
9.1 --- a/tests/run/for_in_string.pyx Sun Apr 18 23:21:10 2010 +0200 9.2 +++ b/tests/run/for_in_string.pyx Mon Apr 19 09:50:19 2010 +0200 9.3 @@ -14,7 +14,7 @@ 9.4 'C' 9.5 """ 9.6 for c in s: 9.7 - if c == 'C': 9.8 + if c == b'C': 9.9 return 'C' 9.10 else: 9.11 return 'X' 9.12 @@ -28,21 +28,21 @@ 9.13 """ 9.14 cdef char c 9.15 for c in s: 9.16 - if c == 'C': 9.17 + if c == b'C': 9.18 return 'C' 9.19 else: 9.20 return 'X' 9.21 9.22 -def for_int_in_unicode(unicode s): 9.23 +def for_pyunicode_in_unicode(unicode s): 9.24 """ 9.25 - >>> for_int_in_unicode(unicode_abc) 9.26 + >>> for_pyunicode_in_unicode(unicode_abc) 9.27 'X' 9.28 - >>> for_int_in_unicode(unicode_ABC) 9.29 + >>> for_pyunicode_in_unicode(unicode_ABC) 9.30 'C' 9.31 """ 9.32 - cdef int c 9.33 + cdef Py_UNICODE c 9.34 for c in s: 9.35 - if c == 'C': 9.36 + if c == u'C': 9.37 return 'C' 9.38 else: 9.39 return 'X'
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 10.2 +++ b/tests/run/py_unicode_type.pyx Mon Apr 19 09:50:19 2010 +0200 10.3 @@ -0,0 +1,44 @@ 10.4 +# -*- coding: iso-8859-1 -*- 10.5 + 10.6 +cdef Py_UNICODE char_ASCII = u'A' 10.7 +cdef Py_UNICODE char_KLINGON = u'\uF8D2' 10.8 + 10.9 + 10.10 +def compare_ASCII(): 10.11 + """ 10.12 + >>> compare_ASCII() 10.13 + True 10.14 + False 10.15 + False 10.16 + """ 10.17 + print(char_ASCII == u'A') 10.18 + print(char_ASCII == u'B') 10.19 + print(char_ASCII == u'\uF8D2') 10.20 + 10.21 + 10.22 +def compare_KLINGON(): 10.23 + """ 10.24 + >>> compare_ASCII() 10.25 + True 10.26 + False 10.27 + False 10.28 + """ 10.29 + print(char_KLINGON == u'\uF8D2') 10.30 + print(char_KLINGON == u'A') 10.31 + print(char_KLINGON == u'B') 10.32 + 10.33 + 10.34 +def index_literal(int i): 10.35 + """ 10.36 + >>> index_literal(0) == '1' 10.37 + True 10.38 + >>> index_literal(-5) == '1' 10.39 + True 10.40 + >>> index_literal(2) == '3' 10.41 + True 10.42 + >>> index_literal(4) == '5' 10.43 + True 10.44 + """ 10.45 + # runtime casts are not currently supported 10.46 + #return <Py_UNICODE>(u"12345"[i]) 10.47 + return u"12345"[i]