Lichen (file lib/__builtins__/unicode.py at c07b0dd14f85)

     1 #!/usr/bin/env python     2      3 """     4 Unicode objects.     5      6 Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from __builtins__.str import basestring    23 from __builtins__.types import check_int    24 from posix.iconv import Converter    25 from native import str_add, unicode_len, unicode_ord, unicode_substr, \    26                    isinstance as _isinstance    27     28 class utf8string(basestring):    29     30     "A character string representation based on UTF-8."    31     32     def __init__(self, other=None, encoding=None):    33     34         """    35         Initialise the string, perhaps from 'other', with any original    36         'encoding' indicated.    37         """    38     39         get_using(basestring.__init__, self)(other)    40         self.encoding = encoding    41         self.length = None    42     43     def _binary_op(self, op, other, sizes=False):    44     45         "Perform 'op' on this object and 'other' if appropriate."    46     47         # Reject non-strings.    48     49         if not _isinstance(other, basestring):    50             return NotImplemented    51     52         # Combining text with bytes.    53     54         if not _isinstance(other, utf8string):    55             s = self.encode()    56         else:    57             s = self    58     59         if sizes:    60             return op(s.__data__, other.__data__, s.__size__, other.__size__)    61         else:    62             return op(s.__data__, other.__data__)    63     64     def _binary_op_rev(self, op, other, sizes=False):    65     66         "Perform 'op' on 'other' and this object if appropriate."    67     68         # Reject non-strings.    69     70         if not _isinstance(other, basestring):    71             return NotImplemented    72     73         # Combining text with bytes.    74     75         if not _isinstance(other, utf8string):    76             s = self.encode()    77         else:    78             s = self    79     80         if sizes:    81             return op(other.__data__, s.__data__, other.__size__, s.__size__)    82         else:    83             return op(other.__data__, s.__data__)    84     85     def _convert(self, result, other):    86     87         "Convert 'result' to a Unicode object if 'other' already is."    88     89         if _isinstance(other, utf8string):    90             return utf8string(result, self.encoding)    91         else:    92             return result    93     94     def _quote_value(self, b, n):    95     96         "Append to 'b' the quoted form of 'n'."    97     98         if n < 0:    99             n += 256   100    101         if n > 0xffff:   102             b.append("\\U")   103             digits = 8   104         else:   105             b.append("\\u")   106             digits = 4   107    108         x = hex(n, "")   109         i = len(x)   110    111         while i < digits:   112             b.append("0")   113             i += 1   114    115         b.append(x)   116    117     # Operator methods.   118    119     def __iadd__(self, other):   120    121         "Return a string combining this string with 'other'."   122    123         return self._convert(self._binary_op(str_add, other, True), other)   124    125     __add__ = __iadd__   126    127     def __radd__(self, other):   128    129         "Return a string combining this string with 'other'."   130    131         return self._convert(self._binary_op_rev(str_add, other, True), other)   132    133     def __len__(self):   134    135         "Return the length of this string in characters."   136    137         if self.length is None:   138             self.length = unicode_len(self.__data__, self.__size__)   139    140         return self.length   141    142     def __ord__(self):   143    144         "Return the value of the string, if only a single character."   145    146         if self.__len__() == 1:   147             return unicode_ord(self.__data__, self.__size__)   148         else:   149             raise ValueError, self   150    151     def encode(self, encoding=None):   152    153         """   154         Encode the string to the given 'encoding' or any original encoding if   155         omitted.   156         """   157    158         encoding = encoding or self.encoding   159         if not encoding:   160             return self   161    162         from_utf8 = Converter("UTF-8", encoding)   163    164         try:   165             from_utf8.feed(self)   166             return str(from_utf8)   167    168         finally:   169             from_utf8.close()   170    171     def join(self, l):   172    173         "Join the elements in 'l' with this string."   174    175         # Empty strings just cause the list elements to be concatenated.   176    177         nonempty = self.__bool__()   178    179         # Non-empty strings join the elements together in a buffer.   180    181         b = buffer()   182         first = True   183         encoding = self.encoding   184    185         for s in l:   186             if first:   187                 first = False   188             elif nonempty:   189                 b.append(self)   190    191             if _isinstance(s, utf8string):   192                 encoding = None   193    194             b.append(s)   195    196         s = str(b)   197         if encoding:   198             s = utf8string(s)   199             s.encoding = encoding   200         return s   201    202     # Special implementation methods.   203    204     def __get_single_item__(self, index):   205        206         "Return the item at the normalised (positive) 'index'."   207        208         self._check_index(index)   209         return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)   210    211     def __get_multiple_items__(self, start, end, step):   212    213         """   214         Return items from 'start' until (but excluding) 'end', at 'step'   215         intervals.   216         """   217    218         if start == end:   219             return ""   220    221         check_int(step)   222    223         if step == 0:   224             raise ValueError(step)   225    226         l = get_using(basestring.__get_multiple_items__, self)(start, end, step)   227         return utf8string("".join(l), self.encoding)   228    229 def unicode(s, encoding):   230    231     "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."   232    233     if isinstance(s, utf8string):   234         return s   235    236     # Obtain a string representation.   237    238     s = s.__str__()   239    240     # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it   241     # needs to be validated.   242    243     to_utf8 = Converter(encoding, "UTF-8")   244    245     try:   246         to_utf8.feed(s)   247         return utf8string(str(to_utf8), encoding)   248    249     finally:   250         to_utf8.close()   251    252 # vim: tabstop=4 expandtab shiftwidth=4