paul@1031 | 1 | #!/usr/bin/env python |
paul@1031 | 2 | |
paul@1031 | 3 | """ |
paul@1031 | 4 | Parsing of textual content. |
paul@1031 | 5 | |
paul@1229 | 6 | Copyright (C) 2014, 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk> |
paul@1031 | 7 | |
paul@1031 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@1031 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@1031 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@1031 | 11 | version. |
paul@1031 | 12 | |
paul@1031 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@1031 | 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@1031 | 15 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@1031 | 16 | details. |
paul@1031 | 17 | |
paul@1031 | 18 | You should have received a copy of the GNU General Public License along with |
paul@1031 | 19 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@1031 | 20 | """ |
paul@1031 | 21 | |
paul@1236 | 22 | from imiptools.filesys import fix_permissions |
paul@1236 | 23 | from os.path import isfile |
paul@1174 | 24 | import codecs |
paul@1031 | 25 | import re |
paul@1031 | 26 | |
def have_table(obj, filename):

    """
    Return whether 'obj' is a table using the given 'filename'.

    An object counts as a table if it offers a 'get_filename' method; the
    result of calling that method is then compared with 'filename'.
    """

    if not hasattr(obj, "get_filename"):
        return False

    return obj.get_filename() == filename
paul@1236 | 32 | |
class FileTable:

    "A file-based data table."

    def __init__(self, filename, mutable=True,
                 in_defaults=None, out_defaults=None,
                 in_converter=None, out_converter=None,
                 tab_separated=True, headers=False):

        """
        Open the table from the file having the given 'filename'. If 'mutable'
        is given as a true value (as is the default), the table can be modified.

        The 'in_defaults' is a list of (index, value) tuples indicating the
        default value where a column either does not exist or provides an empty
        value. The 'out_defaults' is a corresponding list used to serialise
        missing and empty values.

        The 'in_converter' is a callable accepting a tuple of values and
        returning an object. The corresponding 'out_converter' accepts an object
        and returns a tuple of values.

        If 'tab_separated' is specified and is a false value, line parsing using
        the imiptools.text.parse_line function will be performed instead of
        splitting each line of the file using tab characters as separators.

        If 'headers' is specified and is not false, the first line in the table
        will provide header value information.

        A missing file is not an error: the table then starts out empty.
        """

        self.filename = filename
        self.mutable = mutable
        self.in_defaults = in_defaults
        self.out_defaults = out_defaults
        self.in_converter = in_converter
        self.out_converter = out_converter
        self.tab_separated = tab_separated

        # Obtain the items. In subsequent implementations, the items could be
        # retrieved dynamically.

        items = []

        if isfile(filename):
            for item in get_table(filename, in_defaults, tab_separated):
                if self.in_converter:
                    item = self.in_converter(item)
                items.append(item)

        # Obtain header values and separate them from the rest of the data.
        # The header row has passed through any 'in_converter' like the other
        # rows.

        if headers:
            self.table = items[1:]
            self.header_values = items and items[0] or []
        else:
            self.table = items[:]
            self.header_values = []

        self.headers = headers

    def get_filename(self):

        "Return the name of the file backing this table."

        return self.filename

    def get_header_values(self):

        "Return the header values, or an empty list if there are none."

        return self.header_values

    def set_header_values(self, values):

        "Set the header values to the given 'values'."

        self.header_values = values

    def close(self):

        """
        Write any modifications and close the table. Nothing is written if the
        table is not mutable.

        All rows are serialised before the file is opened, so that a failure
        in 'out_converter' or in joining the columns neither truncates the
        existing file nor disturbs the rows held by this object.
        """

        if not self.mutable:
            return

        # NOTE(review): values containing whitespace are not quoted when a
        # space is used as separator, so such rows may not round-trip.

        if self.tab_separated:
            sep = "\t"
        else:
            sep = " "

        # Work on a copy so that the header row never appears in self.table.

        rows = list(self.table)

        # Include any headers in the output.

        if self.headers:
            rows.insert(0, self.header_values)

        lines = []

        for item in rows:
            if self.out_converter:
                item = self.out_converter(item)

            # Insert defaults for empty columns.

            if self.out_defaults:
                item = set_defaults(list(item), self.out_defaults)

            # Separate the columns ready for writing.

            lines.append(sep.join(item))

        f = codecs.open(self.filename, "wb", encoding="utf-8")
        try:
            for line in lines:
                f.write(line + "\n")
        finally:
            f.close()

        fix_permissions(self.filename)

    # General collection methods.

    def __nonzero__(self):

        "Return whether the table has any rows."

        return bool(self.table)

    # Python 3 consults __bool__ instead of __nonzero__.

    __bool__ = __nonzero__

    # List emulation methods.

    def __iadd__(self, other):

        "Append each value from 'other', returning this table."

        for value in other:
            self.append(value)
        return self

    def __iter__(self):
        return iter(self.table)

    def __len__(self):
        return len(self.table)

    def __delitem__(self, i):
        del self.table[i]

    def __delslice__(self, start, end):
        del self.table[start:end]

    def __getitem__(self, i):
        return self.table[i]

    def __getslice__(self, start, end):
        return self.table[start:end]

    def __setitem__(self, i, value):
        self.table[i] = value

    def __setslice__(self, start, end, values):
        self.table[start:end] = values

    def append(self, value):

        "Add 'value' as the last row."

        self.table.append(value)

    def insert(self, i, value):

        "Insert 'value' as a row before position 'i'."

        self.table.insert(i, value)

    def remove(self, value):

        "Remove the first row equal to 'value'."

        self.table.remove(value)

    # Dictionary emulation methods (even though this is not a mapping).

    def clear(self):

        "Remove all rows, leaving any header values untouched."

        del self.table[:]

    # Additional modification methods.

    def replaceall(self, values):

        "Replace all rows with the given 'values'."

        self.table[:] = values
paul@1236 | 188 | |
class FileTableDict(FileTable):

    """
    A file-based table acting as a dictionary.

    Each row of the underlying table is treated as a (key, value) pair. The
    pairs are held in a mapping while the table is open and are converted back
    to rows when the table is closed.
    """

    def __init__(self, filename, mutable=True,
                 in_defaults=None, out_defaults=None,
                 in_converter=None, out_converter=None,
                 tab_separated=True, headers=False):

        """
        Open the table as for FileTable, then build the mapping from the rows
        that were read. Each row must therefore be a two-element sequence.
        """

        FileTable.__init__(self, filename, mutable, in_defaults, out_defaults,
                           in_converter, out_converter, tab_separated, headers)
        self.mapping = dict(self.table)

    def close(self):

        "Convert the mapping back to rows, then write and close the table."

        # A real list is needed since the base class treats the table as one;
        # dict.items() is only a view on Python 3.

        self.table = list(self.mapping.items())
        FileTable.close(self)

    # General collection methods.

    def __nonzero__(self):

        "Return whether the mapping has any entries."

        return bool(self.mapping)

    # Python 3 consults __bool__ instead of __nonzero__.

    __bool__ = __nonzero__

    # List emulation methods.

    def __iter__(self):

        "Iterate over the keys of the mapping."

        return iter(self.mapping)

    def __len__(self):
        return len(self.mapping)

    def append(self, value):

        "Store the (key, value) pair given as 'value'."

        key, value = value
        self.mapping[key] = value

    def insert(self, i, value):

        "Store the (key, value) pair 'value'; the position 'i' is ignored."

        self.append(value)

    def remove(self, value):

        """
        Remove the entry for the key of the (key, value) pair 'value'. Only the
        key is consulted; the value part is not compared.
        """

        key, value = value
        del self.mapping[key]

    # Unimplemented methods.

    def __delslice__(self, start, end):
        raise NotImplementedError("__delslice__")

    def __getslice__(self, start, end):
        raise NotImplementedError("__getslice__")

    def __setslice__(self, start, end, values):
        raise NotImplementedError("__setslice__")

    # Dictionary emulation methods.

    def clear(self):

        "Remove all entries."

        self.mapping.clear()

    def get(self, i, default=None):

        "Return the value for key 'i', or 'default' if the key is absent."

        return self.mapping.get(i, default)

    def keys(self):

        "Return a list of the keys."

        return list(self.mapping.keys())

    def items(self):

        "Return a list of the (key, value) pairs."

        return list(self.mapping.items())

    def update(self, other):

        "Add the entries from 'other' to the mapping."

        self.mapping.update(other)

    def values(self):

        "Return a list of the values."

        return list(self.mapping.values())

    def __delitem__(self, i):
        del self.mapping[i]

    def __getitem__(self, i):
        return self.mapping[i]

    def __setitem__(self, i, value):

        "Set the value for key 'i', silently doing nothing if not mutable."

        if self.mutable:
            self.mapping[i] = value

    # Additional modification methods.

    def replaceall(self, values):

        "Replace all entries using the (key, value) pairs in 'values'."

        self.mapping = dict(values)

    def updateall(self, mapping):

        "Replace all entries using the entries of 'mapping'."

        self.mapping = {}
        self.mapping.update(mapping)
paul@1236 | 280 | |
def first(t):

    "Return the initial element of the sequence 't'."

    head = t[0]
    return head
paul@1031 | 283 | |
def tuplevalue(v):

    "Return a one-element tuple containing 'v'."

    wrapped = v,
    return wrapped
paul@1236 | 286 | |
class FileTableSingle(FileTable):

    """
    A file-based table providing single value items.

    Rows are stored internally as one-element tuples, but values are accepted
    and returned without the enclosing tuple.
    """

    def __iter__(self):

        "Iterate over a snapshot of the values in the table."

        return iter([row[0] for row in self.table])

    def __getitem__(self, i):

        """
        Return the value at position 'i'. Where 'i' is a slice object (as is
        always the case for slicing on Python 3), return a list of values.
        """

        if isinstance(i, slice):
            return [row[0] for row in self.table[i]]
        return self.table[i][0]

    def __getslice__(self, start, end):
        return [row[0] for row in self.table[start:end]]

    def __setitem__(self, i, value):

        """
        Store 'value' at position 'i'. Where 'i' is a slice object, 'value' is
        a collection of values replacing the indicated rows.
        """

        if isinstance(i, slice):
            self.table[i] = [(v,) for v in value]
        else:

            # Store the same one-element tuple form as append and insert so
            # that the value can be retrieved again unchanged.

            self.table[i] = (value,)

    def __setslice__(self, start, end, values):
        self.table[start:end] = [(v,) for v in values]

    def append(self, value):

        "Add 'value' as the last item."

        self.table.append((value,))

    def insert(self, i, value):

        "Insert 'value' before position 'i'."

        self.table.insert(i, (value,))

    def remove(self, value):

        "Remove the first item equal to 'value'."

        self.table.remove((value,))

    # Additional modification methods.

    def replaceall(self, values):

        "Replace all items with the given 'values'."

        self.table[:] = [(v,) for v in values]
paul@1236 | 319 | |
paul@1236 | 320 | |
paul@1031 | 321 | |
paul@1174 | 322 | # Parsing of tabular files. |
paul@1174 | 323 | |
def set_defaults(t, empty_defaults):

    """
    Fill in the list 't' using 'empty_defaults', a collection of (index, value)
    entries. For each entry, 't' is first padded with None so that the index
    exists, and then any false value found at that index is replaced by the
    entry's value.

    The list is modified in place and is also returned.
    """

    for i, default in empty_defaults:

        # Pad the list so that position i can be inspected and assigned.

        missing = i + 1 - len(t)
        if missing > 0:
            t += [None] * missing

        if t[i]:
            continue

        t[i] = default

    return t
paul@1174 | 338 | |
def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    Read the UTF-8 encoded file having the given 'filename' and return its
    contents as a list of tuples, one per line.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    # The stream is closed on leaving the block, whether or not parsing fails.

    with codecs.open(filename, "rb", encoding="utf-8") as stream:
        return get_table_from_stream(stream, empty_defaults, tab_separated)
paul@1174 | 359 | |
def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Read all lines from the stream 'f' and return them as a list of tuples,
    one tuple of column values per line.

    Spaces, carriage returns and newlines are removed from both ends of each
    line before it is divided into columns.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    rows = []

    for raw in f.readlines():
        text = raw.strip(" \r\n")

        # Divide the line into columns.

        if not tab_separated:
            columns = parse_line(text)
        else:
            columns = text.split("\t")

        # Supply values for absent and empty columns.

        if empty_defaults:
            columns = set_defaults(columns, empty_defaults)

        rows.append(tuple(columns))

    return rows
paul@1174 | 389 | |
paul@1236 | 390 | |
paul@1236 | 391 | |
paul@1236 | 392 | # Parsing of lines to obtain functions and arguments. |
paul@1236 | 393 | |
line_pattern_str = (
    r"(?:"
    r"(?:'(.*?)')"          # single-quoted text
    r"|"
    r'(?:"(.*?)")'          # double-quoted text
    r"|"
    r"([^\s]+)"             # non-whitespace characters
    r")+"
    r"(?:\s+|$)"            # optional trailing whitespace before line end
    )

line_pattern = re.compile(line_pattern_str)

# A single component of a word: the same alternatives as are repeated within
# line_pattern, but matched one at a time so that every component is captured.

_component_pattern = re.compile(
    r"'(.*?)'"              # single-quoted text
    r"|"
    r'"(.*?)"'              # double-quoted text
    r"|"
    r"([^\s]+)"             # non-whitespace characters
    )

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes. The quotes around quoted text are removed.

    A quote only starts quoted text at the start of a word or directly after
    other quoted text; elsewhere it is an ordinary character.

    Hello world     -> ['Hello', 'world']
    Hello ' world'  -> ['Hello', ' world']
    Hello' 'world   -> ["Hello'", "'world"]
    'a b'"c d"      -> ['a bc d']
    """

    parts = []

    # Locate the start of each word.

    for match in line_pattern.finditer(text):

        # Collect the components of the word in the order in which they occur.
        # The groups of the word match cannot be used for this: a repeated
        # group only retains its final capture, and the groups are ordered by
        # kind rather than by position.

        components = []
        pos = match.start()

        while True:
            component = _component_pattern.match(text, pos)
            if component is None:
                break

            # Exactly one group participates in each component match.

            components.append(component.group(component.lastindex))
            pos = component.end()

        parts.append("".join(components))

    return parts
paul@1236 | 430 | |
paul@1031 | 431 | # vim: tabstop=4 expandtab shiftwidth=4 |