cython-devel
changeset 3254:06ea96f9a568
native support for Py_UNICODE, coercion between 1-character unicode literals and Py_UNICODE, fix C iteration over unicode strings by using Py_UNICODE*
| author | Stefan Behnel <scoder@users.berlios.de> |
|---|---|
| date | Mon Apr 19 09:50:19 2010 +0200 (3 years ago) |
| parents | ac2b6c7aa8ab |
| children | 884b8a136a09 |
| files | Cython/Compiler/ExprNodes.py Cython/Compiler/Optimize.py Cython/Compiler/Parsing.py Cython/Compiler/PyrexTypes.py Cython/Shadow.py tests/errors/e_strcoerce.pyx tests/errors/py_unicode_type_errors.pyx tests/errors/string_assignments.pyx tests/run/for_in_string.pyx tests/run/py_unicode_type.pyx |
line diff
1.1 --- a/Cython/Compiler/ExprNodes.py Sun Apr 18 23:21:10 2010 +0200
1.2 +++ b/Cython/Compiler/ExprNodes.py Mon Apr 19 09:50:19 2010 +0200
1.3 @@ -860,7 +860,10 @@
1.4 def coerce_to(self, dst_type, env):
1.5 if dst_type.is_int:
1.6 if not self.can_coerce_to_char_literal():
1.7 - error(self.pos, "Only single-character strings can be coerced into ints.")
1.8 + error(self.pos, "Only single-character string literals can be coerced into ints.")
1.9 + return self
1.10 + if dst_type is PyrexTypes.c_py_unicode_type:
1.11 + error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
1.12 return self
1.13 return CharNode(self.pos, value=self.value)
1.14
1.15 @@ -915,13 +918,22 @@
1.16 def coerce_to(self, dst_type, env):
1.17 if dst_type is self.type:
1.18 pass
1.19 + elif dst_type is PyrexTypes.c_py_unicode_type:
1.20 + if not self.can_coerce_to_char_literal():
1.21 + error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
1.22 + return self
1.23 + int_value = ord(self.value)
1.24 + return IntNode(self.pos, value=int_value, constant_result=int_value)
1.25 elif not dst_type.is_pyobject:
1.26 - error(self.pos, "Unicode objects do not support coercion to C types.")
1.27 + error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
1.28 elif dst_type is not py_object_type:
1.29 if not self.check_for_coercion_error(dst_type):
1.30 self.fail_assignment(dst_type)
1.31 return self
1.32
1.33 + def can_coerce_to_char_literal(self):
1.34 + return len(self.value) == 1
1.35 +
1.36 def generate_evaluation_code(self, code):
1.37 self.result_code = code.get_py_string_const(self.value)
1.38
1.39 @@ -5426,10 +5438,10 @@
1.40 type1_can_be_int = False
1.41 type2_can_be_int = False
1.42
1.43 - if isinstance(operand1, (StringNode, BytesNode)) \
1.44 + if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \
1.45 and operand1.can_coerce_to_char_literal():
1.46 type1_can_be_int = True
1.47 - if isinstance(operand2, (StringNode, BytesNode)) \
1.48 + if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \
1.49 and operand2.can_coerce_to_char_literal():
1.50 type2_can_be_int = True
1.51
2.1 --- a/Cython/Compiler/Optimize.py Sun Apr 18 23:21:10 2010 +0200
2.2 +++ b/Cython/Compiler/Optimize.py Mon Apr 19 09:50:19 2010 +0200
2.3 @@ -137,7 +137,7 @@
2.4 return node
2.5
2.6 PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType(
2.7 - PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE*
2.8 + PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [
2.9 PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None)
2.10 ])
2.11
3.1 --- a/Cython/Compiler/Parsing.py Sun Apr 18 23:21:10 2010 +0200
3.2 +++ b/Cython/Compiler/Parsing.py Mon Apr 19 09:50:19 2010 +0200
3.3 @@ -1851,6 +1851,7 @@
3.4
3.5 special_basic_c_types = {
3.6 # name : (signed, longness)
3.7 + "Py_UNICODE" : (0, 0),
3.8 "Py_ssize_t" : (2, 0),
3.9 "size_t" : (0, 0),
3.10 }
4.1 --- a/Cython/Compiler/PyrexTypes.py Sun Apr 18 23:21:10 2010 +0200
4.2 +++ b/Cython/Compiler/PyrexTypes.py Mon Apr 19 09:50:19 2010 +0200
4.3 @@ -863,6 +863,20 @@
4.4 return 'int'
4.5
4.6
4.7 +class CPyUnicodeIntType(CIntType):
4.8 + # Py_UNICODE
4.9 +
4.10 + # Conversion from a unicode string to Py_UNICODE at runtime is not
4.11 + # currently supported and may never be - we only convert from and
4.12 + # to integers here. The maximum value for a Py_UNICODE is
4.13 + # 1114111, so PyInt_FromLong() will do just fine here.
4.14 +
4.15 + to_py_function = "PyInt_FromLong"
4.16 +
4.17 + def sign_and_name(self):
4.18 + return "Py_UNICODE"
4.19 +
4.20 +
4.21 class CPySSizeTType(CIntType):
4.22
4.23 to_py_function = "PyInt_FromSsize_t"
4.24 @@ -2075,14 +2089,15 @@
4.25 rank_to_type_name = (
4.26 "char", # 0
4.27 "short", # 1
4.28 - "int", # 2
4.29 - "long", # 3
4.30 - "Py_ssize_t", # 4
4.31 - "size_t", # 5
4.32 - "PY_LONG_LONG", # 6
4.33 - "float", # 7
4.34 - "double", # 8
4.35 - "long double", # 9
4.36 + "Py_UNICODE", # 2
4.37 + "int", # 3
4.38 + "long", # 4
4.39 + "Py_ssize_t", # 5
4.40 + "size_t", # 6
4.41 + "PY_LONG_LONG", # 7
4.42 + "float", # 8
4.43 + "double", # 9
4.44 + "long double", # 10
4.45 )
4.46
4.47 py_object_type = PyObjectType()
4.48 @@ -2093,29 +2108,30 @@
4.49
4.50 c_uchar_type = CIntType(0, 0)
4.51 c_ushort_type = CIntType(1, 0)
4.52 -c_uint_type = CIntType(2, 0)
4.53 -c_ulong_type = CIntType(3, 0)
4.54 -c_ulonglong_type = CIntType(6, 0)
4.55 +c_py_unicode_type = CPyUnicodeIntType(2, 0)
4.56 +c_uint_type = CIntType(3, 0)
4.57 +c_ulong_type = CIntType(4, 0)
4.58 +c_ulonglong_type = CIntType(7, 0)
4.59
4.60 c_char_type = CIntType(0, 1)
4.61 c_short_type = CIntType(1, 1)
4.62 -c_int_type = CIntType(2, 1)
4.63 -c_long_type = CIntType(3, 1)
4.64 -c_longlong_type = CIntType(6, 1)
4.65 +c_int_type = CIntType(3, 1)
4.66 +c_long_type = CIntType(4, 1)
4.67 +c_longlong_type = CIntType(7, 1)
4.68
4.69 c_schar_type = CIntType(0, 2)
4.70 c_sshort_type = CIntType(1, 2)
4.71 -c_sint_type = CIntType(2, 2)
4.72 -c_slong_type = CIntType(3, 2)
4.73 -c_slonglong_type = CIntType(6, 2)
4.74 +c_sint_type = CIntType(3, 2)
4.75 +c_slong_type = CIntType(4, 2)
4.76 +c_slonglong_type = CIntType(7, 2)
4.77
4.78 -c_bint_type = CBIntType(2, 1)
4.79 -c_py_ssize_t_type = CPySSizeTType(4, 2)
4.80 -c_size_t_type = CSizeTType(5, 0)
4.81 +c_bint_type = CBIntType(3, 1)
4.82 +c_py_ssize_t_type = CPySSizeTType(5, 2)
4.83 +c_size_t_type = CSizeTType(6, 0)
4.84
4.85 -c_float_type = CFloatType(7, math_h_modifier='f')
4.86 -c_double_type = CFloatType(8)
4.87 -c_longdouble_type = CFloatType(9, math_h_modifier='l')
4.88 +c_float_type = CFloatType(8, math_h_modifier='f')
4.89 +c_double_type = CFloatType(9)
4.90 +c_longdouble_type = CFloatType(10, math_h_modifier='l')
4.91
4.92 c_float_complex_type = CComplexType(c_float_type)
4.93 c_double_complex_type = CComplexType(c_double_type)
4.94 @@ -2131,7 +2147,7 @@
4.95 c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type)
4.96 c_size_t_ptr_type = CPtrType(c_size_t_type)
4.97
4.98 -c_returncode_type = CIntType(2, 1, is_returncode = 1)
4.99 +c_returncode_type = CIntType(3, 1, is_returncode = 1)
4.100 c_anon_enum_type = CAnonEnumType(-1, 1)
4.101
4.102 # the Py_buffer type is defined in Builtin.py
4.103 @@ -2165,6 +2181,7 @@
4.104 (1, 0, "bint"): c_bint_type,
4.105 (0, 0, "size_t") : c_size_t_type,
4.106 (2, 0, "Py_ssize_t"): c_py_ssize_t_type,
4.107 + (0, 0, "Py_UNICODE"): c_py_unicode_type,
4.108
4.109 (1, 0, "float"): c_float_type,
4.110 (1, 0, "double"): c_double_type,
4.111 @@ -2383,6 +2400,8 @@
4.112 signed = 2
4.113 elif name == 'size_t':
4.114 signed = 0
4.115 + elif name == 'Py_UNICODE':
4.116 + signed = 0
4.117 else:
4.118 if name.startswith('u'):
4.119 name = name[1:]
5.1 --- a/Cython/Shadow.py Sun Apr 18 23:21:10 2010 +0200
5.2 +++ b/Cython/Shadow.py Mon Apr 19 09:50:19 2010 +0200
5.3 @@ -174,7 +174,7 @@
5.4
5.5 # Predefined types
5.6
5.7 -int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
5.8 +int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
5.9 float_types = ['longdouble', 'double', 'float']
5.10 complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex']
5.11 other_types = ['bint', 'void']
5.12 @@ -183,7 +183,7 @@
5.13
5.14 for name in int_types:
5.15 gs[name] = typedef(py_int)
5.16 - if not name.endswith('size_t'):
5.17 + if name != 'Py_UNICODE' and not name.endswith('size_t'):
5.18 gs['u'+name] = typedef(py_int)
5.19 gs['s'+name] = typedef(py_int)
5.20
6.1 --- a/tests/errors/e_strcoerce.pyx Sun Apr 18 23:21:10 2010 +0200
6.2 +++ b/tests/errors/e_strcoerce.pyx Mon Apr 19 09:50:19 2010 +0200
6.3 @@ -4,12 +4,14 @@
6.4
6.5 cdef int x1 = "\xFF" # works
6.6 cdef int x2 = "\u0FFF" # fails
6.7 -cdef int x3 = u"\xFF" # fails
6.8
6.9 +cdef Py_UNICODE u1 = u"\xFF" # works
6.10 +cdef int u3 = u"\xFF" # fails
6.11
6.12 -_ERRORS = u"""
6.13 -2:14: Only single-character strings can be coerced into ints.
6.14 -3:14: Only single-character strings can be coerced into ints.
6.15 -6:15: Only single-character strings can be coerced into ints.
6.16 -7:14: Unicode objects do not support coercion to C types.
6.17 +
6.18 +_ERRORS = """
6.19 +2:14: Only single-character string literals can be coerced into ints.
6.20 +3:14: Only single-character string literals can be coerced into ints.
6.21 +6:15: Only single-character string literals can be coerced into ints.
6.22 +9:14: Unicode literals do not support coercion to C types other than Py_UNICODE.
6.23 """
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/tests/errors/py_unicode_type_errors.pyx Mon Apr 19 09:50:19 2010 +0200
7.3 @@ -0,0 +1,24 @@
7.4 +# -*- coding: iso-8859-1 -*-
7.5 +
7.6 +cdef Py_UNICODE char_ASCII = u'A'
7.7 +cdef Py_UNICODE char_KLINGON = u'\uF8D2'
7.8 +
7.9 +def char_too_long_ASCII():
7.10 + cdef Py_UNICODE c = u'AB'
7.11 +
7.12 +def char_too_long_Unicode():
7.13 + cdef Py_UNICODE c = u'A\uF8D2'
7.14 +
7.15 +def char_too_long_bytes():
7.16 + cdef Py_UNICODE c = b'AB'
7.17 +
7.18 +def char_too_long_latin1():
7.19 + cdef Py_UNICODE char_bytes_latin1 = b'ö'
7.20 +
7.21 +
7.22 +_ERRORS = """
7.23 +7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
7.24 +10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
7.25 +13:24: Only single-character string literals can be coerced into ints.
7.26 +16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
7.27 +"""
8.1 --- a/tests/errors/string_assignments.pyx Sun Apr 18 23:21:10 2010 +0200
8.2 +++ b/tests/errors/string_assignments.pyx Mon Apr 19 09:50:19 2010 +0200
8.3 @@ -50,7 +50,7 @@
8.4 cdef list l_f3 = u1
8.5
8.6 _ERRORS = u"""
8.7 -25:20: Unicode objects do not support coercion to C types.
8.8 +25:20: Unicode literals do not support coercion to C types other than Py_UNICODE.
8.9 26:22: Unicode objects do not support coercion to C types.
8.10 27:22: 'str' objects do not support coercion to C types (use 'bytes'?).
8.11
9.1 --- a/tests/run/for_in_string.pyx Sun Apr 18 23:21:10 2010 +0200
9.2 +++ b/tests/run/for_in_string.pyx Mon Apr 19 09:50:19 2010 +0200
9.3 @@ -14,7 +14,7 @@
9.4 'C'
9.5 """
9.6 for c in s:
9.7 - if c == 'C':
9.8 + if c == b'C':
9.9 return 'C'
9.10 else:
9.11 return 'X'
9.12 @@ -28,21 +28,21 @@
9.13 """
9.14 cdef char c
9.15 for c in s:
9.16 - if c == 'C':
9.17 + if c == b'C':
9.18 return 'C'
9.19 else:
9.20 return 'X'
9.21
9.22 -def for_int_in_unicode(unicode s):
9.23 +def for_pyunicode_in_unicode(unicode s):
9.24 """
9.25 - >>> for_int_in_unicode(unicode_abc)
9.26 + >>> for_pyunicode_in_unicode(unicode_abc)
9.27 'X'
9.28 - >>> for_int_in_unicode(unicode_ABC)
9.29 + >>> for_pyunicode_in_unicode(unicode_ABC)
9.30 'C'
9.31 """
9.32 - cdef int c
9.33 + cdef Py_UNICODE c
9.34 for c in s:
9.35 - if c == 'C':
9.36 + if c == u'C':
9.37 return 'C'
9.38 else:
9.39 return 'X'
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
10.2 +++ b/tests/run/py_unicode_type.pyx Mon Apr 19 09:50:19 2010 +0200
10.3 @@ -0,0 +1,44 @@
10.4 +# -*- coding: iso-8859-1 -*-
10.5 +
10.6 +cdef Py_UNICODE char_ASCII = u'A'
10.7 +cdef Py_UNICODE char_KLINGON = u'\uF8D2'
10.8 +
10.9 +
10.10 +def compare_ASCII():
10.11 + """
10.12 + >>> compare_ASCII()
10.13 + True
10.14 + False
10.15 + False
10.16 + """
10.17 + print(char_ASCII == u'A')
10.18 + print(char_ASCII == u'B')
10.19 + print(char_ASCII == u'\uF8D2')
10.20 +
10.21 +
10.22 +def compare_KLINGON():
10.23 + """
10.24 + >>> compare_KLINGON()
10.25 + True
10.26 + False
10.27 + False
10.28 + """
10.29 + print(char_KLINGON == u'\uF8D2')
10.30 + print(char_KLINGON == u'A')
10.31 + print(char_KLINGON == u'B')
10.32 +
10.33 +
10.34 +def index_literal(int i):
10.35 + """
10.36 + >>> index_literal(0) == '1'
10.37 + True
10.38 + >>> index_literal(-5) == '1'
10.39 + True
10.40 + >>> index_literal(2) == '3'
10.41 + True
10.42 + >>> index_literal(4) == '5'
10.43 + True
10.44 + """
10.45 + # runtime casts are not currently supported
10.46 + #return <Py_UNICODE>(u"12345"[i])
10.47 + return u"12345"[i]
