imip-agent

Annotated imiptools/text.py

1370:b4544a1a80c1
2017-10-25 Paul Boddie Moved period collection abstractions into the period module.
paul@1031 1
#!/usr/bin/env python
paul@1031 2
paul@1031 3
"""
paul@1031 4
Parsing of textual content.
paul@1031 5
paul@1229 6
Copyright (C) 2014, 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
paul@1031 7
paul@1031 8
This program is free software; you can redistribute it and/or modify it under
paul@1031 9
the terms of the GNU General Public License as published by the Free Software
paul@1031 10
Foundation; either version 3 of the License, or (at your option) any later
paul@1031 11
version.
paul@1031 12
paul@1031 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@1031 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@1031 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@1031 16
details.
paul@1031 17
paul@1031 18
You should have received a copy of the GNU General Public License along with
paul@1031 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@1031 20
"""
paul@1031 21
paul@1236 22
from imiptools.filesys import fix_permissions
paul@1236 23
from os.path import isfile
paul@1174 24
import codecs
paul@1031 25
import re
paul@1031 26
paul@1236 27
def have_table(obj, filename):

    """
    Return whether 'obj' is a table using the given 'filename'. Objects not
    providing a 'get_filename' method are never regarded as such tables.
    """

    if not hasattr(obj, "get_filename"):
        return False

    return obj.get_filename() == filename
paul@1236 32
paul@1236 33
class FileTable:

    "A file-based data table."

    def __init__(self, filename, mutable=True,
                 in_defaults=None, out_defaults=None,
                 in_converter=None, out_converter=None,
                 tab_separated=True, headers=False):

        """
        Open the table from the file having the given 'filename'. If 'mutable'
        is given as a true value (as is the default), the table is written
        back to the file when closed.

        The 'in_defaults' is a list of (index, value) tuples indicating the
        default value where a column either does not exist or provides an empty
        value. The 'out_defaults' is a corresponding list used to serialise
        missing and empty values.

        The 'in_converter' is a callable accepting a tuple of values and
        returning an object. The corresponding 'out_converter' accepts an object
        and returns a tuple of values. Both converters are also applied to any
        header line.

        If 'tab_separated' is specified and is a false value, line parsing using
        the imiptools.text.parse_line function will be performed instead of
        splitting each line of the file using tab characters as separators.

        If 'headers' is specified and is not false, the first line in the table
        will provide header value information.

        Where no file exists with the given 'filename', the table is empty.
        """

        self.filename = filename
        self.mutable = mutable
        self.in_defaults = in_defaults
        self.out_defaults = out_defaults
        self.in_converter = in_converter
        self.out_converter = out_converter
        self.tab_separated = tab_separated

        # Obtain the items. In subsequent implementations, the items could be
        # retrieved dynamically.

        items = []

        if isfile(filename):
            for item in get_table(filename, in_defaults, tab_separated):
                if self.in_converter:
                    item = self.in_converter(item)
                items.append(item)

        # Obtain header values and separate them from the rest of the data.

        if headers:
            self.table = items[1:]
        else:
            self.table = items[:]

        if headers and items:
            self.header_values = items[0] or []
        else:
            self.header_values = []

        self.headers = headers

    def get_filename(self):

        "Return the name of the file providing the table."

        return self.filename

    def get_header_values(self):

        "Return the header values of the table."

        return self.header_values

    def set_header_values(self, values):

        "Set the header 'values' of the table."

        self.header_values = values

    def close(self):

        """
        Write any modifications and close the table. Nothing is written if the
        table is not mutable.
        """

        if not self.mutable:
            return

        f = codecs.open(self.filename, "wb", encoding="utf-8")
        try:
            self._write_rows(f)
        finally:
            f.close()
            fix_permissions(self.filename)

    def _write_rows(self, f):

        """
        Serialise any header values followed by the table's items to the stream
        'f', one line per item. The table itself is not modified, so a failure
        when writing cannot leave the header values amongst the items.
        """

        if self.tab_separated:
            sep = "\t"
        else:
            sep = " "

        # Include any headers in the output without touching the stored items.

        rows = self.table

        if self.headers:
            rows = [self.header_values] + list(rows)

        for item in rows:
            if self.out_converter:
                item = self.out_converter(item)

            # Insert defaults for empty columns.

            if self.out_defaults:
                item = set_defaults(list(item), self.out_defaults)

            # Separate the columns and write to the file.

            f.write(sep.join(item) + "\n")

    # General collection methods.

    def __nonzero__(self):
        return bool(self.table)

    # Truth testing under the Python 3 protocol name.

    __bool__ = __nonzero__

    # List emulation methods.

    def __iadd__(self, other):

        # Use append so that subclasses can apply their own item handling.

        for value in other:
            self.append(value)
        return self

    def __iter__(self):
        return iter(self.table)

    def __len__(self):
        return len(self.table)

    def __delitem__(self, i):
        del self.table[i]

    def __delslice__(self, start, end):
        del self.table[start:end]

    def __getitem__(self, i):
        return self.table[i]

    def __getslice__(self, start, end):
        return self.table[start:end]

    def __setitem__(self, i, value):
        self.table[i] = value

    def __setslice__(self, start, end, values):
        self.table[start:end] = values

    def append(self, value):
        self.table.append(value)

    def insert(self, i, value):
        self.table.insert(i, value)

    def remove(self, value):
        self.table.remove(value)

    # Dictionary emulation methods (even though this is not a mapping).

    def clear(self):
        del self.table[:]

    # Additional modification methods.

    def replaceall(self, values):

        "Replace all items in the table with the given 'values'."

        self.table[:] = values
paul@1236 188
paul@1236 189
class FileTableDict(FileTable):

    "A file-based table acting as a dictionary."

    def __init__(self, filename, mutable=True,
                 in_defaults=None, out_defaults=None,
                 in_converter=None, out_converter=None,
                 tab_separated=True, headers=False):

        """
        Open the table as done for FileTable, using the same parameters, and
        build a mapping from the items obtained, each of which must therefore
        be a (key, value) pair.
        """

        FileTable.__init__(self, filename, mutable, in_defaults, out_defaults,
                           in_converter, out_converter, tab_separated, headers)
        self.mapping = dict(self.table)

    def close(self):

        "Write the items of the mapping as the table and close the table."

        # Provide a genuine list of items so that the rows can be handled as a
        # sequence regardless of what the mapping's items method returns.

        self.table = list(self.mapping.items())
        FileTable.close(self)

    # General collection methods.

    def __nonzero__(self):
        return bool(self.mapping)

    # Truth testing under the Python 3 protocol name.

    __bool__ = __nonzero__

    # List emulation methods.

    def __iter__(self):

        # Iteration yields the keys, as with a dictionary.

        return iter(self.mapping)

    def __len__(self):
        return len(self.mapping)

    def append(self, value):

        "Store the given 'value', this being a (key, value) pair."

        key, value = value
        self.mapping[key] = value

    def insert(self, i, value):

        "Store the (key, value) pair 'value', ignoring the position 'i'."

        self.append(value)

    def remove(self, value):

        """
        Remove the entry for the key in the (key, value) pair 'value'. Only
        the key is consulted.
        """

        key, value = value
        del self.mapping[key]

    # Unimplemented methods.

    def __delslice__(self, start, end):
        raise NotImplementedError("__delslice__")

    def __getslice__(self, start, end):
        raise NotImplementedError("__getslice__")

    def __setslice__(self, start, end, values):
        raise NotImplementedError("__setslice__")

    # Dictionary emulation methods.

    def clear(self):
        self.mapping.clear()

    def get(self, i, default=None):
        return self.mapping.get(i, default)

    def keys(self):
        return self.mapping.keys()

    def items(self):
        return self.mapping.items()

    def update(self, other):
        self.mapping.update(other)

    def values(self):
        return self.mapping.values()

    def __delitem__(self, i):
        del self.mapping[i]

    def __getitem__(self, i):
        return self.mapping[i]

    def __setitem__(self, i, value):

        # Assignments are silently ignored for tables that are not mutable.

        if self.mutable:
            self.mapping[i] = value

    # Additional modification methods.

    def replaceall(self, values):

        "Replace the mapping using the given (key, value) pairs in 'values'."

        self.mapping = {}
        self.mapping.update(dict(values))

    def updateall(self, mapping):

        "Replace the mapping with the contents of the given 'mapping'."

        self.mapping = {}
        self.mapping.update(mapping)
paul@1236 280
paul@1236 281
def first(t):

    "Return the initial element of the sequence 't'."

    head = t[0]
    return head
paul@1031 283
paul@1236 284
def tuplevalue(v):

    "Return a tuple having 'v' as its only element."

    wrapped = (v,)
    return wrapped
paul@1236 286
paul@1236 287
class FileTableSingle(FileTable):

    """
    A file-based table providing single value items. Each item is held in the
    underlying table as a single-element tuple, but is presented to and
    accepted from callers as the plain value.
    """

    def __iter__(self):
        return iter([row[0] for row in self.table])

    def __getitem__(self, i):

        # Slice objects select several rows, each of which must be unwrapped.

        if isinstance(i, slice):
            return [row[0] for row in self.table[i]]
        return self.table[i][0]

    def __getslice__(self, start, end):
        return [row[0] for row in self.table[start:end]]

    def __setitem__(self, i, value):

        # Store each value as a single-element tuple, consistent with append
        # and insert, so that it can be retrieved again as a plain value.

        if isinstance(i, slice):
            self.table[i] = [(v,) for v in value]
        else:
            self.table[i] = (value,)

    def __setslice__(self, start, end, values):
        self.table[start:end] = [(v,) for v in values]

    def append(self, value):
        self.table.append((value,))

    def insert(self, i, value):
        self.table.insert(i, (value,))

    def remove(self, value):
        self.table.remove((value,))

    # Additional modification methods.

    def replaceall(self, values):

        "Replace all items in the table with the given plain 'values'."

        self.table[:] = [(v,) for v in values]
paul@1236 319
paul@1236 320
paul@1031 321
paul@1174 322
# Parsing of tabular files.
paul@1174 323
paul@1174 324
def set_defaults(t, empty_defaults):

    """
    Update the list 't' in place, supplying a default wherever a value is
    empty or absent, and return the list. The 'empty_defaults' collection
    provides the defaults as entries of the form (index, value), with the list
    being padded using None where an index lies beyond its end.
    """

    for index, fallback in empty_defaults:

        # Extend the list so that the indicated position exists.

        shortfall = index + 1 - len(t)
        if shortfall > 0:
            t += [None] * shortfall

        if not t[index]:
            t[index] = fallback

    return t
paul@1174 338
paul@1174 339
def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    Read the file having the given 'filename', decoding it as UTF-8, and
    return a list of tuples representing the file's contents.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    stream = codecs.open(filename, "rb", encoding="utf-8")

    # Always release the file, even if the contents cannot be processed.

    try:
        rows = get_table_from_stream(stream, empty_defaults, tab_separated)
    finally:
        stream.close()

    return rows
paul@1174 359
paul@1174 360
def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Read all lines from the stream 'f' and return a list of tuples, one for
    each line, representing the contents of the stream.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    rows = []

    for raw_line in f.readlines():

        # Remove surrounding spaces and line endings but not tab characters,
        # so that empty columns at the end of a line are retained.

        stripped = raw_line.strip(" \r\n")

        if not tab_separated:
            columns = parse_line(stripped)
        else:
            columns = stripped.split("\t")

        if empty_defaults:
            columns = set_defaults(columns, empty_defaults)

        rows.append(tuple(columns))

    return rows
paul@1174 389
paul@1236 390
paul@1236 391
paul@1236 392
# Parsing of lines to obtain functions and arguments.
paul@1236 393
paul@1236 394
line_pattern_str = (
                   r"(?:"
                   r"(?:'(.*?)')" # single-quoted text
                   r"|"
                   r'(?:"(.*?)")' # double-quoted text
                   r"|"
                   r"([^\s]+)"    # non-whitespace characters
                   r")+"
                   r"(?:\s+|$)"   # whitespace or the end of the text after a word
                   )

line_pattern = re.compile(line_pattern_str)

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes. Quotes are only recognised at the start of a word or
    directly after another quoted section; elsewhere they are ordinary
    characters.

    Hello world     -> ['Hello', 'world']
    Hello ' world'  -> ['Hello', ' world']
    Hello' 'world   -> ["Hello'", "'world"]

    NOTE: each word is built from the text captured by the three groups of the
    pattern, and a group matching more than once within a word only retains its
    last match.
    """

    # Combine the components of each word by joining the matching groups,
    # treating groups that did not participate in the match as empty.

    return ["".join(group or "" for group in match.groups())
            for match in line_pattern.finditer(text)]
paul@1236 430
paul@1031 431
# vim: tabstop=4 expandtab shiftwidth=4