Lichen

Changeset

934:2989aab1b4f7
3 months ago Paul Boddie raw files shortlog changelog graph Renamed the utf8string class to unicode, eliminating the unicode function. This means that the simple case of merely returning an object if it is already a Unicode object no longer occurs when using the unicode callable, but such behaviour might be better supported with more general customised instantiation functionality.
common.py (file) generator.py (file) lib/__builtins__/__init__.py (file) lib/__builtins__/character.py (file) lib/__builtins__/int.py (file) lib/__builtins__/stream.py (file) lib/__builtins__/unicode.py (file) tests/unicode.py (file)
     1.1 --- a/common.py	Mon Jun 28 22:29:21 2021 +0200
     1.2 +++ b/common.py	Tue Jun 29 22:24:09 2021 +0200
     1.3 @@ -3,8 +3,7 @@
     1.4  """
     1.5  Common functions.
     1.6  
     1.7 -Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
     1.8 -              2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
     1.9 +Copyright (C) 2007-2019, 2021 Paul Boddie <paul@boddie.org.uk>
    1.10  
    1.11  This program is free software; you can redistribute it and/or modify it under
    1.12  the terms of the GNU General Public License as published by the Free Software
    1.13 @@ -1597,8 +1596,6 @@
    1.14  
    1.15      if name == "string":
    1.16          modname = "str"
    1.17 -    elif name == "utf8string":
    1.18 -        modname = "unicode"
    1.19      elif name == "NoneType":
    1.20          modname = "none"
    1.21      else:
    1.22 @@ -1612,8 +1609,6 @@
    1.23  
    1.24      if name == "str":
    1.25          return "string"
    1.26 -    elif name == "unicode":
    1.27 -        return "utf8string"
    1.28      else:
    1.29          return name
    1.30  
     2.1 --- a/generator.py	Mon Jun 28 22:29:21 2021 +0200
     2.2 +++ b/generator.py	Tue Jun 29 22:24:09 2021 +0200
     2.3 @@ -3,7 +3,7 @@
     2.4  """
     2.5  Generate C code from object layouts and other deduced information.
     2.6  
     2.7 -Copyright (C) 2015, 2016, 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
     2.8 +Copyright (C) 2015-2019, 2021 Paul Boddie <paul@boddie.org.uk>
     2.9  
    2.10  This program is free software; you can redistribute it and/or modify it under
    2.11  the terms of the GNU General Public License as published by the Free Software
    2.12 @@ -49,7 +49,7 @@
    2.13      string_type = "__builtins__.str.string"
    2.14      tuple_type = "__builtins__.tuple.tuple"
    2.15      type_type = "__builtins__.core.type"
    2.16 -    unicode_type = "__builtins__.unicode.utf8string"
    2.17 +    unicode_type = "__builtins__.unicode.unicode"
    2.18  
    2.19      none_value = "__builtins__.none.None"
    2.20  
     3.1 --- a/lib/__builtins__/__init__.py	Mon Jun 28 22:29:21 2021 +0200
     3.2 +++ b/lib/__builtins__/__init__.py	Tue Jun 29 22:24:09 2021 +0200
     3.3 @@ -3,7 +3,7 @@
     3.4  """
     3.5  Simple built-in classes and functions.
     3.6  
     3.7 -Copyright (C) 2015, 2016, 2017, 2019 Paul Boddie <paul@boddie.org.uk>
     3.8 +Copyright (C) 2015, 2016, 2017, 2019, 2021 Paul Boddie <paul@boddie.org.uk>
     3.9  
    3.10  This program is free software; you can redistribute it and/or modify it under
    3.11  the terms of the GNU General Public License as published by the Free Software
    3.12 @@ -70,7 +70,7 @@
    3.13  from __builtins__.set import frozenset, set
    3.14  from __builtins__.str import basestring, str, string
    3.15  from __builtins__.tuple import tuple
    3.16 -from __builtins__.unicode import unicode, utf8string
    3.17 +from __builtins__.unicode import unicode
    3.18  
    3.19  # Functions.
    3.20  
     4.1 --- a/lib/__builtins__/character.py	Mon Jun 28 22:29:21 2021 +0200
     4.2 +++ b/lib/__builtins__/character.py	Tue Jun 29 22:24:09 2021 +0200
     4.3 @@ -103,7 +103,7 @@
     4.4      check_int(i)
     4.5  
     4.6      if 0 <= i <= 2097151:
     4.7 -        return utf8string(unicode_unichr(i))
     4.8 +        return unicode(unicode_unichr(i))
     4.9      else:
    4.10          raise ValueError, i
    4.11  
     5.1 --- a/lib/__builtins__/int.py	Mon Jun 28 22:29:21 2021 +0200
     5.2 +++ b/lib/__builtins__/int.py	Tue Jun 29 22:24:09 2021 +0200
     5.3 @@ -20,7 +20,7 @@
     5.4  """
     5.5  
     5.6  from __builtins__.str import basestring
     5.7 -from __builtins__.unicode import utf8string
     5.8 +from __builtins__.unicode import unicode
     5.9  from native import get_maxint, get_minint, is_int, \
    5.10                     int_add, int_and, int_div, int_eq, int_ge, int_gt, \
    5.11                     int_lshift, int_le, int_lt, int_mod, int_mul, int_ne, \
    5.12 @@ -278,7 +278,7 @@
    5.13  
    5.14          "Return a string representation."
    5.15  
    5.16 -        return utf8string(int_str(self))
    5.17 +        return unicode(int_str(self))
    5.18  
    5.19      __repr__ = __str__
    5.20  
     6.1 --- a/lib/__builtins__/stream.py	Mon Jun 28 22:29:21 2021 +0200
     6.2 +++ b/lib/__builtins__/stream.py	Tue Jun 29 22:24:09 2021 +0200
     6.3 @@ -144,7 +144,7 @@
     6.4          # Encode text as bytes if necessary. When the encoding is not set, any
     6.5          # original encoding of the text will be applied.
     6.6  
     6.7 -        if _isinstance(s, utf8string):
     6.8 +        if _isinstance(s, unicode):
     6.9              s = s.encode(self.encoding)
    6.10  
    6.11          fwrite(self.__data__, s)
     7.1 --- a/lib/__builtins__/unicode.py	Mon Jun 28 22:29:21 2021 +0200
     7.2 +++ b/lib/__builtins__/unicode.py	Tue Jun 29 22:24:09 2021 +0200
     7.3 @@ -3,7 +3,7 @@
     7.4  """
     7.5  Unicode objects.
     7.6  
     7.7 -Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
     7.8 +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
     7.9  
    7.10  This program is free software; you can redistribute it and/or modify it under
    7.11  the terms of the GNU General Public License as published by the Free Software
    7.12 @@ -25,21 +25,58 @@
    7.13  from native import str_add, unicode_len, unicode_ord, unicode_substr, \
    7.14                     isinstance as _isinstance
    7.15  
    7.16 -class utf8string(basestring):
    7.17 +class unicode(basestring):
    7.18  
    7.19      "A character string representation based on UTF-8."
    7.20  
    7.21 -    def __init__(self, other=None, encoding=None):
    7.22 +    def __init__(self, s, encoding=None, original=None):
    7.23  
    7.24          """
    7.25 -        Initialise the string, perhaps from 'other', with any original
    7.26 -        'encoding' indicated.
    7.27 +        Initialise the string from 's', employing any indicated 'encoding'
    7.28 +        for the provided string data.
    7.29 +
    7.30 +        If 'original' is indicated, this may be used to override the original
    7.31 +        encoding. This is useful when the string data is already in UTF-8
    7.32 +        format, but where the original encoding needs to be communicated.
    7.33          """
    7.34  
    7.35 -        get_using(basestring.__init__, self)(other)
    7.36 -        self.encoding = encoding
    7.37          self.length = None
    7.38  
    7.39 +        # Initialise using another Unicode object.
    7.40 +
    7.41 +        if _isinstance(s, unicode):
    7.42 +            get_using(basestring.__init__, self)(s)
    7.43 +            self.encoding = s.encoding
    7.44 +
    7.45 +        # Initialise using suitable string data but with an explicit original
    7.46 +        # encoding.
    7.47 +
    7.48 +        elif original:
    7.49 +            get_using(basestring.__init__, self)(s)
    7.50 +            self.encoding = original
    7.51 +
    7.52 +        # Initialise using string data having either UTF-8 or another encoding,
    7.53 +        # converting to UTF-8 and retaining the encoding details as the original
    7.54 +        # encoding.
    7.55 +
    7.56 +        else:
    7.57 +            # Obtain a string representation.
    7.58 +
    7.59 +            s = s.__str__()
    7.60 +
    7.61 +            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
    7.62 +            # needs to be validated.
    7.63 +
    7.64 +            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
    7.65 +
    7.66 +            try:
    7.67 +                to_utf8.feed(s)
    7.68 +                get_using(basestring.__init__, self)(str(to_utf8))
    7.69 +            finally:
    7.70 +                to_utf8.close()
    7.71 +
    7.72 +            self.encoding = encoding
    7.73 +
    7.74      def _binary_op(self, op, other, sizes=False):
    7.75  
    7.76          "Perform 'op' on this object and 'other' if appropriate."
    7.77 @@ -51,7 +88,7 @@
    7.78  
    7.79          # Combining text with bytes.
    7.80  
    7.81 -        if not _isinstance(other, utf8string):
    7.82 +        if not _isinstance(other, unicode):
    7.83              s = self.encode()
    7.84          else:
    7.85              s = self
    7.86 @@ -72,7 +109,7 @@
    7.87  
    7.88          # Combining text with bytes.
    7.89  
    7.90 -        if not _isinstance(other, utf8string):
    7.91 +        if not _isinstance(other, unicode):
    7.92              s = self.encode()
    7.93          else:
    7.94              s = self
    7.95 @@ -86,8 +123,8 @@
    7.96  
    7.97          "Convert 'result' to a Unicode object if 'other' already is."
    7.98  
    7.99 -        if _isinstance(other, utf8string):
   7.100 -            return utf8string(result, self.encoding)
   7.101 +        if _isinstance(other, unicode):
   7.102 +            return unicode(result, None, self.encoding)
   7.103          else:
   7.104              return result
   7.105  
   7.106 @@ -188,15 +225,14 @@
   7.107              elif nonempty:
   7.108                  b.append(self)
   7.109  
   7.110 -            if _isinstance(s, utf8string):
   7.111 +            if _isinstance(s, unicode):
   7.112                  encoding = None
   7.113  
   7.114              b.append(s)
   7.115  
   7.116          s = str(b)
   7.117          if encoding:
   7.118 -            s = utf8string(s)
   7.119 -            s.encoding = encoding
   7.120 +            s = unicode(s, None, encoding)
   7.121          return s
   7.122  
   7.123      # Special implementation methods.
   7.124 @@ -204,9 +240,9 @@
   7.125      def __get_single_item__(self, index):
   7.126      
   7.127          "Return the item at the normalised (positive) 'index'."
   7.128 -    
   7.129 + 
   7.130          self._check_index(index)
   7.131 -        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
   7.132 +        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
   7.133  
   7.134      def __get_multiple_items__(self, start, end, step):
   7.135  
   7.136 @@ -224,29 +260,6 @@
   7.137              raise ValueError(step)
   7.138  
   7.139          l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
   7.140 -        return utf8string("".join(l), self.encoding)
   7.141 -
   7.142 -def unicode(s, encoding):
   7.143 -
   7.144 -    "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
   7.145 -
   7.146 -    if isinstance(s, utf8string):
   7.147 -        return s
   7.148 -
   7.149 -    # Obtain a string representation.
   7.150 -
   7.151 -    s = s.__str__()
   7.152 -
   7.153 -    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
   7.154 -    # needs to be validated.
   7.155 -
   7.156 -    to_utf8 = Converter(encoding, "UTF-8")
   7.157 -
   7.158 -    try:
   7.159 -        to_utf8.feed(s)
   7.160 -        return utf8string(str(to_utf8), encoding)
   7.161 -
   7.162 -    finally:
   7.163 -        to_utf8.close()
   7.164 +        return unicode("".join(l), None, self.encoding)
   7.165  
   7.166  # vim: tabstop=4 expandtab shiftwidth=4
     8.1 --- a/tests/unicode.py	Mon Jun 28 22:29:21 2021 +0200
     8.2 +++ b/tests/unicode.py	Tue Jun 29 22:24:09 2021 +0200
     8.3 @@ -48,7 +48,7 @@
     8.4  s7 = r"\346\370\345"
     8.5  print "Untranslated values:"
     8.6  print s7                            # \346\370\345
     8.7 -print s7.__class__                  # __builtins__.unicode.utf8string
     8.8 +print s7.__class__                  # __builtins__.unicode.unicode
     8.9  print len(s7)                       # 12
    8.10  
    8.11  # Obtain text and print it.
    8.12 @@ -58,7 +58,7 @@
    8.13  u = unicode(b"æøå", "ISO-8859-15")
    8.14  print "Unicode values:"
    8.15  print u                             # æøå
    8.16 -print u.__class__                   # __builtins__.unicode.utf8string
    8.17 +print u.__class__                   # __builtins__.unicode.unicode
    8.18  print u.encode("ISO-8859-15")       # æøå
    8.19  print u.encoding                    # ISO-8859-15
    8.20  print len(u)                        # 3
    8.21 @@ -68,7 +68,7 @@
    8.22  u2 = u"æøå"
    8.23  print "Unicode values:"
    8.24  print u2                            # æøå
    8.25 -print u2.__class__                  # __builtins__.unicode.utf8string
    8.26 +print u2.__class__                  # __builtins__.unicode.unicode
    8.27  print u2.encode("ISO-8859-15")      # æøå
    8.28  print u2.encoding                   # ISO-8859-15
    8.29  print len(u2)                       # 3
    8.30 @@ -78,7 +78,7 @@
    8.31  u3 = "æøå"
    8.32  print "Unicode values:"
    8.33  print u3                            # æøå
    8.34 -print u3.__class__                  # __builtins__.unicode.utf8string
    8.35 +print u3.__class__                  # __builtins__.unicode.unicode
    8.36  print u3.encode("ISO-8859-15")      # æøå
    8.37  print u3.encoding                   # ISO-8859-15
    8.38  print len(u3)                       # 3
    8.39 @@ -88,7 +88,7 @@
    8.40  u4 = unicode("æøå", "ISO-8859-15")
    8.41  print "Unicode values:"
    8.42  print u4                            # æøå
    8.43 -print u4.__class__                  # __builtins__.unicode.utf8string
    8.44 +print u4.__class__                  # __builtins__.unicode.unicode
    8.45  print u4.encode("ISO-8859-15")      # æøå
    8.46  print u4.encoding                   # ISO-8859-15
    8.47  print len(u4)                       # 3
    8.48 @@ -163,7 +163,7 @@
    8.49  uu2 = u + u2
    8.50  print "Unicode values:"
    8.51  print uu2                           # æøåæøå
    8.52 -print uu2.__class__                 # __builtins__.unicode.utf8string
    8.53 +print uu2.__class__                 # __builtins__.unicode.unicode
    8.54  print uu2.encoding                  # ISO-8859-15
    8.55  print len(uu2)                      # 6
    8.56  
    8.57 @@ -195,7 +195,7 @@
    8.58  # Test character access.
    8.59  
    8.60  u0 = u[0]
    8.61 -print u0.__class__                  # __builtins__.unicode.utf8string
    8.62 +print u0.__class__                  # __builtins__.unicode.unicode
    8.63  print u0.encoding                   # ISO-8859-15
    8.64  print u0                            # æ
    8.65  print u[-1]                         # å