Lichen

Changeset

935:7dec14799571
2021-06-29 Paul Boddie raw files shortlog changelog graph Merged changes from the default branch. trailing-data
generator.py (file)
     1.1 --- a/common.py	Sun Jun 27 22:14:51 2021 +0200
     1.2 +++ b/common.py	Tue Jun 29 22:24:48 2021 +0200
     1.3 @@ -3,8 +3,7 @@
     1.4  """
     1.5  Common functions.
     1.6  
     1.7 -Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
     1.8 -              2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
     1.9 +Copyright (C) 2007-2019, 2021 Paul Boddie <paul@boddie.org.uk>
    1.10  
    1.11  This program is free software; you can redistribute it and/or modify it under
    1.12  the terms of the GNU General Public License as published by the Free Software
    1.13 @@ -1597,8 +1596,6 @@
    1.14  
    1.15      if name == "string":
    1.16          modname = "str"
    1.17 -    elif name == "utf8string":
    1.18 -        modname = "unicode"
    1.19      elif name == "NoneType":
    1.20          modname = "none"
    1.21      else:
    1.22 @@ -1612,8 +1609,6 @@
    1.23  
    1.24      if name == "str":
    1.25          return "string"
    1.26 -    elif name == "unicode":
    1.27 -        return "utf8string"
    1.28      else:
    1.29          return name
    1.30  
     2.1 --- a/docs/wiki/History	Sun Jun 27 22:14:51 2021 +0200
     2.2 +++ b/docs/wiki/History	Tue Jun 29 22:24:48 2021 +0200
     2.3 @@ -71,7 +71,7 @@
     2.4  == Current Work ==
     2.5  
     2.6  It was with such realisations that a new project was effectively born.
     2.7 -Tentatively called "!PythonLight" but renamed to "Lichen" as the code matured,
     2.8 +Tentatively called "PythonLight" but renamed to "Lichen" as the code matured,
     2.9  the objectives now involved a simpler processing framework that merely
    2.10  attempted to catalogue structure members, to determine the origins of such
    2.11  members, and to record data flow within namespaces in order to determine
     3.1 --- a/docs/wiki/Toolchain	Sun Jun 27 22:14:51 2021 +0200
     3.2 +++ b/docs/wiki/Toolchain	Tue Jun 29 22:24:48 2021 +0200
     3.3 @@ -73,12 +73,12 @@
     3.4  which the `parser` module effectively is (as would the `ast` module also be if
     3.5  it were used here), with it typically being implemented as an extension module
     3.6  in a non-Python language (in C for CPython, in Java for Jython, and so on).
     3.7 -Fortunately, the !PyPy project implemented their own parsing module,
     3.8 -`pyparser`, that is intended to be used within the !PyPy environment together
     3.9 -with their own `ast` equivalent, but it has been possible to rework `pyparser`
    3.10 -to produce representations that are compatible with the `compiler` package,
    3.11 -itself being modified in various ways to achieve compatibility (and also to
    3.12 -provide various other conveniences).
    3.13 +Fortunately, the [[http://pypy.org/|PyPy]] project implemented their own
    3.14 +parsing module, `pyparser`, that is intended to be used within the PyPy
    3.15 +environment together with their own `ast` equivalent, but it has been possible
    3.16 +to rework `pyparser` to produce representations that are compatible with the
    3.17 +`compiler` package, itself being modified in various ways to achieve
    3.18 +compatibility (and also to provide various other conveniences).
    3.19  
    3.20  == Program Analysis ==
    3.21  
     4.1 --- a/generator.py	Sun Jun 27 22:14:51 2021 +0200
     4.2 +++ b/generator.py	Tue Jun 29 22:24:48 2021 +0200
     4.3 @@ -3,7 +3,7 @@
     4.4  """
     4.5  Generate C code from object layouts and other deduced information.
     4.6  
     4.7 -Copyright (C) 2015, 2016, 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
     4.8 +Copyright (C) 2015-2019, 2021 Paul Boddie <paul@boddie.org.uk>
     4.9  
    4.10  This program is free software; you can redistribute it and/or modify it under
    4.11  the terms of the GNU General Public License as published by the Free Software
    4.12 @@ -51,7 +51,7 @@
    4.13      string_type = "__builtins__.str.string"
    4.14      tuple_type = "__builtins__.tuple.tuple"
    4.15      type_type = "__builtins__.core.type"
    4.16 -    unicode_type = "__builtins__.unicode.utf8string"
    4.17 +    unicode_type = "__builtins__.unicode.unicode"
    4.18  
    4.19      none_value = "__builtins__.none.None"
    4.20  
    4.21 @@ -1265,14 +1265,14 @@
    4.22  
    4.23          # Special-case the integer type.
    4.24  
    4.25 +        # Here, the __builtins__.int.new_int function is called with the
    4.26 +        # initialiser's parameter.
    4.27 +
    4.28          if path == self.int_type:
    4.29              print >>f_code, """\
    4.30 -__attr %s(__attr __self, __attr number_or_string)
    4.31 +__attr %s(__attr __self, __attr number_or_string, __attr base)
    4.32  {
    4.33 -    if (!__BOOL(__fn_native_int_is_int(__self, number_or_string)))
    4.34 -        __raise_value_error(number_or_string);
    4.35 -
    4.36 -    return number_or_string;
    4.37 +    return __fn___builtins___int_new_int(__NULL, number_or_string, base);
    4.38  }
    4.39  """ % (
    4.40                  encode_instantiator_pointer(path),
     5.1 --- a/lib/__builtins__/__init__.py	Sun Jun 27 22:14:51 2021 +0200
     5.2 +++ b/lib/__builtins__/__init__.py	Tue Jun 29 22:24:48 2021 +0200
     5.3 @@ -3,7 +3,7 @@
     5.4  """
     5.5  Simple built-in classes and functions.
     5.6  
     5.7 -Copyright (C) 2015, 2016, 2017, 2019 Paul Boddie <paul@boddie.org.uk>
     5.8 +Copyright (C) 2015, 2016, 2017, 2019, 2021 Paul Boddie <paul@boddie.org.uk>
     5.9  
    5.10  This program is free software; you can redistribute it and/or modify it under
    5.11  the terms of the GNU General Public License as published by the Free Software
    5.12 @@ -70,7 +70,7 @@
    5.13  from __builtins__.set import frozenset, set
    5.14  from __builtins__.str import basestring, str, string
    5.15  from __builtins__.tuple import tuple
    5.16 -from __builtins__.unicode import unicode, utf8string
    5.17 +from __builtins__.unicode import unicode
    5.18  
    5.19  # Functions.
    5.20  
     6.1 --- a/lib/__builtins__/character.py	Sun Jun 27 22:14:51 2021 +0200
     6.2 +++ b/lib/__builtins__/character.py	Tue Jun 29 22:24:48 2021 +0200
     6.3 @@ -103,7 +103,7 @@
     6.4      check_int(i)
     6.5  
     6.6      if 0 <= i <= 2097151:
     6.7 -        return utf8string(unicode_unichr(i))
     6.8 +        return unicode(unicode_unichr(i))
     6.9      else:
    6.10          raise ValueError, i
    6.11  
     7.1 --- a/lib/__builtins__/int.py	Sun Jun 27 22:14:51 2021 +0200
     7.2 +++ b/lib/__builtins__/int.py	Tue Jun 29 22:24:48 2021 +0200
     7.3 @@ -19,22 +19,55 @@
     7.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     7.5  """
     7.6  
     7.7 -from __builtins__.unicode import utf8string
     7.8 +from __builtins__.str import basestring
     7.9 +from __builtins__.unicode import unicode
    7.10  from native import get_maxint, get_minint, is_int, \
    7.11                     int_add, int_and, int_div, int_eq, int_ge, int_gt, \
    7.12                     int_lshift, int_le, int_lt, int_mod, int_mul, int_ne, \
    7.13                     int_neg, int_not, int_or, int_pow, int_rshift, int_str, \
    7.14                     int_sub, int_xor
    7.15  
    7.16 +def new_int(number_or_string, base=10):
    7.17 +
    7.18 +    "Initialise the integer with the given 'number_or_string'."
    7.19 +
    7.20 +    if is_int(number_or_string):
    7.21 +        return number_or_string
    7.22 +    elif isinstance(number_or_string, basestring):
    7.23 +        return str_to_int(number_or_string, base)
    7.24 +    else:
    7.25 +        raise TypeError
    7.26 +
    7.27 +def str_to_int(value, base=10):
    7.28 +
    7.29 +    "Decode the string 'value' using the given 'base'."
    7.30 +
    7.31 +    # NOTE: Add support for lower and upper in the string classes.
    7.32 +
    7.33 +    #value = value.lower()
    7.34 +    len_value = len(value)
    7.35 +    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
    7.36 +
    7.37 +    result = 0
    7.38 +    i = 0
    7.39 +
    7.40 +    while i < len_value:
    7.41 +        c = value[i]
    7.42 +        d = digits.index(c)
    7.43 +        result = result * base + d
    7.44 +        i += 1
    7.45 +
    7.46 +    return result
    7.47 +
    7.48  class int:
    7.49  
    7.50      "An integer abstraction."
    7.51  
    7.52 -    def __init__(self, number_or_string=None):
    7.53 +    def __init__(self, number_or_string=None, base=10):
    7.54  
    7.55          "Initialise the integer with the given 'number_or_string'."
    7.56  
    7.57 -        # Implemented in the translator.
    7.58 +        # Implemented by new_int above, invoked specially by the translator.
    7.59  
    7.60          pass
    7.61  
    7.62 @@ -245,7 +278,7 @@
    7.63  
    7.64          "Return a string representation."
    7.65  
    7.66 -        return utf8string(int_str(self))
    7.67 +        return unicode(int_str(self))
    7.68  
    7.69      __repr__ = __str__
    7.70  
     8.1 --- a/lib/__builtins__/stream.py	Sun Jun 27 22:14:51 2021 +0200
     8.2 +++ b/lib/__builtins__/stream.py	Tue Jun 29 22:24:48 2021 +0200
     8.3 @@ -144,7 +144,7 @@
     8.4          # Encode text as bytes if necessary. When the encoding is not set, any
     8.5          # original encoding of the text will be applied.
     8.6  
     8.7 -        if _isinstance(s, utf8string):
     8.8 +        if _isinstance(s, unicode):
     8.9              s = s.encode(self.encoding)
    8.10  
    8.11          fwrite(self.__data__, s)
     9.1 --- a/lib/__builtins__/unicode.py	Sun Jun 27 22:14:51 2021 +0200
     9.2 +++ b/lib/__builtins__/unicode.py	Tue Jun 29 22:24:48 2021 +0200
     9.3 @@ -3,7 +3,7 @@
     9.4  """
     9.5  Unicode objects.
     9.6  
     9.7 -Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
     9.8 +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
     9.9  
    9.10  This program is free software; you can redistribute it and/or modify it under
    9.11  the terms of the GNU General Public License as published by the Free Software
    9.12 @@ -25,21 +25,58 @@
    9.13  from native import str_add, unicode_len, unicode_ord, unicode_substr, \
    9.14                     isinstance as _isinstance
    9.15  
    9.16 -class utf8string(basestring):
    9.17 +class unicode(basestring):
    9.18  
    9.19      "A character string representation based on UTF-8."
    9.20  
    9.21 -    def __init__(self, other=None, encoding=None):
    9.22 +    def __init__(self, s, encoding=None, original=None):
    9.23  
    9.24          """
    9.25 -        Initialise the string, perhaps from 'other', with any original
    9.26 -        'encoding' indicated.
    9.27 +        Initialise the string from 'other', employing any indicated 'encoding'
    9.28 +        for the provided string data.
    9.29 +
    9.30 +        If 'original' is indicated, this may be used to override the original
    9.31 +        encoding. This is useful when the string data is already in UTF-8
    9.32 +        format, but where the original encoding needs to be communicated.
    9.33          """
    9.34  
    9.35 -        get_using(basestring.__init__, self)(other)
    9.36 -        self.encoding = encoding
    9.37          self.length = None
    9.38  
    9.39 +        # Initialise using another Unicode object.
    9.40 +
    9.41 +        if _isinstance(s, unicode):
    9.42 +            get_using(basestring.__init__, self)(s)
    9.43 +            self.encoding = s.encoding
    9.44 +
    9.45 +        # Initialise using suitable string data but with an explicit original
    9.46 +        # encoding.
    9.47 +
    9.48 +        elif original:
    9.49 +            get_using(basestring.__init__, self)(s)
    9.50 +            self.encoding = original
    9.51 +
    9.52 +        # Initialise using string data having either UTF-8 or another encoding,
    9.53 +        # converting to UTF-8 and retaining the encoding details as the original
    9.54 +        # encoding.
    9.55 +
    9.56 +        else:
    9.57 +            # Obtain a string representation.
    9.58 +
    9.59 +            s = s.__str__()
    9.60 +
    9.61 +            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
    9.62 +            # needs to be validated.
    9.63 +
    9.64 +            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
    9.65 +
    9.66 +            try:
    9.67 +                to_utf8.feed(s)
    9.68 +                get_using(basestring.__init__, self)(str(to_utf8))
    9.69 +            finally:
    9.70 +                to_utf8.close()
    9.71 +
    9.72 +            self.encoding = encoding
    9.73 +
    9.74      def _binary_op(self, op, other, sizes=False):
    9.75  
    9.76          "Perform 'op' on this object and 'other' if appropriate."
    9.77 @@ -51,7 +88,7 @@
    9.78  
    9.79          # Combining text with bytes.
    9.80  
    9.81 -        if not _isinstance(other, utf8string):
    9.82 +        if not _isinstance(other, unicode):
    9.83              s = self.encode()
    9.84          else:
    9.85              s = self
    9.86 @@ -72,7 +109,7 @@
    9.87  
    9.88          # Combining text with bytes.
    9.89  
    9.90 -        if not _isinstance(other, utf8string):
    9.91 +        if not _isinstance(other, unicode):
    9.92              s = self.encode()
    9.93          else:
    9.94              s = self
    9.95 @@ -86,8 +123,8 @@
    9.96  
    9.97          "Convert 'result' to a Unicode object if 'other' already is."
    9.98  
    9.99 -        if _isinstance(other, utf8string):
   9.100 -            return utf8string(result, self.encoding)
   9.101 +        if _isinstance(other, unicode):
   9.102 +            return unicode(result, None, self.encoding)
   9.103          else:
   9.104              return result
   9.105  
   9.106 @@ -188,15 +225,14 @@
   9.107              elif nonempty:
   9.108                  b.append(self)
   9.109  
   9.110 -            if _isinstance(s, utf8string):
   9.111 +            if _isinstance(s, unicode):
   9.112                  encoding = None
   9.113  
   9.114              b.append(s)
   9.115  
   9.116          s = str(b)
   9.117          if encoding:
   9.118 -            s = utf8string(s)
   9.119 -            s.encoding = encoding
   9.120 +            s = unicode(s, None, encoding)
   9.121          return s
   9.122  
   9.123      # Special implementation methods.
   9.124 @@ -204,9 +240,9 @@
   9.125      def __get_single_item__(self, index):
   9.126      
   9.127          "Return the item at the normalised (positive) 'index'."
   9.128 -    
   9.129 + 
   9.130          self._check_index(index)
   9.131 -        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
   9.132 +        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
   9.133  
   9.134      def __get_multiple_items__(self, start, end, step):
   9.135  
   9.136 @@ -224,29 +260,6 @@
   9.137              raise ValueError(step)
   9.138  
   9.139          l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
   9.140 -        return utf8string("".join(l), self.encoding)
   9.141 -
   9.142 -def unicode(s, encoding):
   9.143 -
   9.144 -    "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
   9.145 -
   9.146 -    if isinstance(s, utf8string):
   9.147 -        return s
   9.148 -
   9.149 -    # Obtain a string representation.
   9.150 -
   9.151 -    s = s.__str__()
   9.152 -
   9.153 -    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
   9.154 -    # needs to be validated.
   9.155 -
   9.156 -    to_utf8 = Converter(encoding, "UTF-8")
   9.157 -
   9.158 -    try:
   9.159 -        to_utf8.feed(s)
   9.160 -        return utf8string(str(to_utf8), encoding)
   9.161 -
   9.162 -    finally:
   9.163 -        to_utf8.close()
   9.164 +        return unicode("".join(l), None, self.encoding)
   9.165  
   9.166  # vim: tabstop=4 expandtab shiftwidth=4
    10.1 --- a/tests/int.py	Sun Jun 27 22:14:51 2021 +0200
    10.2 +++ b/tests/int.py	Tue Jun 29 22:24:48 2021 +0200
    10.3 @@ -11,3 +11,20 @@
    10.4      a = int("a")        # should raise an exception
    10.5  except ValueError, exc:
    10.6      print 'int("a") failed:', exc.value
    10.7 +
    10.8 +try:
    10.9 +    a = int("!")        # should raise an exception
   10.10 +except ValueError, exc:
   10.11 +    print 'int("!") failed:', exc.value
   10.12 +
   10.13 +a = int("a", 16)
   10.14 +b = int("123")
   10.15 +print a                 # 10
   10.16 +print b, i, b == i      # 123, 123, True
   10.17 +print b, j, b == j      # 123, 123, True
   10.18 +
   10.19 +a_is_int = isinstance(a, int)
   10.20 +j_is_int = isinstance(j, int)
   10.21 +
   10.22 +print a_is_int          # True
   10.23 +print j_is_int          # True
    11.1 --- a/tests/unicode.py	Sun Jun 27 22:14:51 2021 +0200
    11.2 +++ b/tests/unicode.py	Tue Jun 29 22:24:48 2021 +0200
    11.3 @@ -48,7 +48,7 @@
    11.4  s7 = r"\346\370\345"
    11.5  print "Untranslated values:"
    11.6  print s7                            # \346\370\345
    11.7 -print s7.__class__                  # __builtins__.unicode.utf8string
    11.8 +print s7.__class__                  # __builtins__.unicode.unicode
    11.9  print len(s7)                       # 12
   11.10  
   11.11  # Obtain text and print it.
   11.12 @@ -58,7 +58,7 @@
   11.13  u = unicode(b"æøå", "ISO-8859-15")
   11.14  print "Unicode values:"
   11.15  print u                             # æøå
   11.16 -print u.__class__                   # __builtins__.unicode.utf8string
   11.17 +print u.__class__                   # __builtins__.unicode.unicode
   11.18  print u.encode("ISO-8859-15")       # æøå
   11.19  print u.encoding                    # ISO-8859-15
   11.20  print len(u)                        # 3
   11.21 @@ -68,7 +68,7 @@
   11.22  u2 = u"æøå"
   11.23  print "Unicode values:"
   11.24  print u2                            # æøå
   11.25 -print u2.__class__                  # __builtins__.unicode.utf8string
   11.26 +print u2.__class__                  # __builtins__.unicode.unicode
   11.27  print u2.encode("ISO-8859-15")      # æøå
   11.28  print u2.encoding                   # ISO-8859-15
   11.29  print len(u2)                       # 3
   11.30 @@ -78,7 +78,7 @@
   11.31  u3 = "æøå"
   11.32  print "Unicode values:"
   11.33  print u3                            # æøå
   11.34 -print u3.__class__                  # __builtins__.unicode.utf8string
   11.35 +print u3.__class__                  # __builtins__.unicode.unicode
   11.36  print u3.encode("ISO-8859-15")      # æøå
   11.37  print u3.encoding                   # ISO-8859-15
   11.38  print len(u3)                       # 3
   11.39 @@ -88,7 +88,7 @@
   11.40  u4 = unicode("æøå", "ISO-8859-15")
   11.41  print "Unicode values:"
   11.42  print u4                            # æøå
   11.43 -print u4.__class__                  # __builtins__.unicode.utf8string
   11.44 +print u4.__class__                  # __builtins__.unicode.unicode
   11.45  print u4.encode("ISO-8859-15")      # æøå
   11.46  print u4.encoding                   # ISO-8859-15
   11.47  print len(u4)                       # 3
   11.48 @@ -163,7 +163,7 @@
   11.49  uu2 = u + u2
   11.50  print "Unicode values:"
   11.51  print uu2                           # æøåæøå
   11.52 -print uu2.__class__                 # __builtins__.unicode.utf8string
   11.53 +print uu2.__class__                 # __builtins__.unicode.unicode
   11.54  print uu2.encoding                  # ISO-8859-15
   11.55  print len(uu2)                      # 6
   11.56  
   11.57 @@ -195,7 +195,7 @@
   11.58  # Test character access.
   11.59  
   11.60  u0 = u[0]
   11.61 -print u0.__class__                  # __builtins__.unicode.utf8string
   11.62 +print u0.__class__                  # __builtins__.unicode.unicode
   11.63  print u0.encoding                   # ISO-8859-15
   11.64  print u0                            # æ
   11.65  print u[-1]                         # å