Lichen (file lib/__builtins__/unicode.py at d6ffe931de37)

     1 #!/usr/bin/env python     2      3 """     4 Unicode objects.     5      6 Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from __builtins__.str import basestring    23 from __builtins__.types import check_int    24 from posix.iconv import Converter    25 from native import str_add, unicode_len, unicode_ord, unicode_substr, \    26                    isinstance as _isinstance    27     28 class unicode(basestring):    29     30     "A character string representation based on UTF-8."    31     32     def __init__(self, s, encoding=None, original=None):    33     34         """    35         Initialise the string from 'other', employing any indicated 'encoding'    36         for the provided string data.    37     38         If 'original' is indicated, this may be used to override the original    39         encoding. This is useful when the string data is already in UTF-8    40         format, but where the original encoding needs to be communicated.    41         """    42     43         self.length = None    44     45         # Initialise using another Unicode object.    46     47         if _isinstance(s, unicode):    48             get_using(basestring.__init__, self)(s)    49             self.encoding = s.encoding    50     51         # Initialise using suitable string data but with an explicit original    52         # encoding.    53     54         elif original:    55             get_using(basestring.__init__, self)(s)    56             self.encoding = original    57     58         # Initialise using string data having either UTF-8 or another encoding,    59         # converting to UTF-8 and retaining the encoding details as the original    60         # encoding.    61     62         else:    63             # Obtain a string representation.    64     65             s = s.__str__()    66     67             # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it    68             # needs to be validated.    69     70             to_utf8 = Converter(encoding or "UTF-8", "UTF-8")    71     72             try:    73                 to_utf8.feed(s)    74                 get_using(basestring.__init__, self)(str(to_utf8))    75             finally:    76                 to_utf8.close()    77     78             self.encoding = encoding    79     80     def _binary_op(self, op, other, sizes=False):    81     82         "Perform 'op' on this object and 'other' if appropriate."    83     84         # Reject non-strings.    85     86         if not _isinstance(other, basestring):    87             return NotImplemented    88     89         # Combining text with bytes.    90     91         if not _isinstance(other, unicode):    92             s = self.encode()    93         else:    94             s = self    95     96         if sizes:    97             return op(s.__data__, other.__data__, s.__size__, other.__size__)    98         else:    99             return op(s.__data__, other.__data__)   100    101     def _binary_op_rev(self, op, other, sizes=False):   102    103         "Perform 'op' on 'other' and this object if appropriate."   104    105         # Reject non-strings.   106    107         if not _isinstance(other, basestring):   108             return NotImplemented   109    110         # Combining text with bytes.   111    112         if not _isinstance(other, unicode):   113             s = self.encode()   114         else:   115             s = self   116    117         if sizes:   118             return op(other.__data__, s.__data__, other.__size__, s.__size__)   119         else:   120             return op(other.__data__, s.__data__)   121    122     def _convert(self, result, other):   123    124         "Convert 'result' to a Unicode object if 'other' already is."   125    126         if _isinstance(other, unicode):   127             return unicode(result, None, self.encoding)   128         else:   129             return result   130    131     def _quote_value(self, b, n):   132    133         "Append to 'b' the quoted form of 'n'."   134    135         if n < 0:   136             n += 256   137    138         if n > 0xffff:   139             b.append("\\U")   140             digits = 8   141         else:   142             b.append("\\u")   143             digits = 4   144    145         x = hex(n, "")   146         i = len(x)   147    148         while i < digits:   149             b.append("0")   150             i += 1   151    152         b.append(x)   153    154     # Operator methods.   155    156     def __iadd__(self, other):   157    158         "Return a string combining this string with 'other'."   159    160         return self._convert(self._binary_op(str_add, other, True), other)   161    162     __add__ = __iadd__   163    164     def __radd__(self, other):   165    166         "Return a string combining this string with 'other'."   167    168         return self._convert(self._binary_op_rev(str_add, other, True), other)   169    170     def __len__(self):   171    172         "Return the length of this string in characters."   173    174         if self.length is None:   175             self.length = unicode_len(self.__data__, self.__size__)   176    177         return self.length   178    179     def __ord__(self):   180    181         "Return the value of the string, if only a single character."   182    183         if self.__len__() == 1:   184             return unicode_ord(self.__data__, self.__size__)   185         else:   186             raise ValueError, self   187    188     def encode(self, encoding=None):   189    190         """   191         Encode the string to the given 'encoding' or any original encoding if   192         omitted.   193         """   194    195         encoding = encoding or self.encoding   196         if not encoding:   197             return self   198    199         from_utf8 = Converter("UTF-8", encoding)   200    201         try:   202             from_utf8.feed(self)   203             return str(from_utf8)   204    205         finally:   206             from_utf8.close()   207    208     def join(self, l):   209    210         "Join the elements in 'l' with this string."   211    212         # Empty strings just cause the list elements to be concatenated.   213    214         nonempty = self.__bool__()   215    216         # Non-empty strings join the elements together in a buffer.   217    218         b = buffer()   219         first = True   220         encoding = self.encoding   221    222         for s in l:   223             if first:   224                 first = False   225             elif nonempty:   226                 b.append(self)   227    228             if _isinstance(s, unicode):   229                 encoding = None   230    231             b.append(s)   232    233         s = str(b)   234         if encoding:   235             s = unicode(s, None, encoding)   236         return s   237    238     # Special implementation methods.   239    240     def __get_single_item__(self, index):   241        242         "Return the item at the normalised (positive) 'index'."   243     244         self._check_index(index)   245         return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)   246    247     def __get_multiple_items__(self, start, end, step):   248    249         """   250         Return items from 'start' until (but excluding) 'end', at 'step'   251         intervals.   252         """   253    254         if start == end:   255             return ""   256    257         check_int(step)   258    259         if step == 0:   260             raise ValueError(step)   261    262         l = get_using(basestring.__get_multiple_items__, self)(start, end, step)   263         return unicode("".join(l), None, self.encoding)   264    265 # vim: tabstop=4 expandtab shiftwidth=4