#!/usr/bin/env python

"""
Parsing of textual content.

Copyright (C) 2014, 2015, 2016 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import codecs
import re

# Parsing of lines to obtain functions and arguments.

# A single component of a word: a single-quoted section, a double-quoted
# section, or a run of non-whitespace characters. Exactly one of the three
# groups is set for any given component match.

component_pattern_str = r"(?:'(.*?)')" \
                        r"|" \
                        r'(?:"(.*?)")' \
                        r"|" \
                        r"([^\s]+)"
component_pattern = re.compile(component_pattern_str)

# A complete word: one or more components followed by whitespace or the end of
# the text.

line_pattern_str = r"(?:" + \
                   component_pattern_str + \
                   r")+" + \
                   r"(?:\s+|$)"
line_pattern = re.compile(line_pattern_str)

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes.

    Adjacent components of a word (for example, a quoted section immediately
    followed by unquoted text) are concatenated in the order in which they
    appear in the input, with the quote characters removed.
    """

    parts = []

    # Match each complete word in turn.

    for word in line_pattern.finditer(text):

        # The groups of a repeated pattern only retain the final occurrence of
        # each alternative, and are ordered by group number rather than by
        # position in the text. Therefore, the matched word is scanned again
        # in order to visit every component in sequence.

        components = []

        for component in component_pattern.finditer(word.group(0)):
            for value in component.groups():
                if value is not None:
                    components.append(value)

        parts.append("".join(components))

    return parts

# Parsing of tabular files.

def set_defaults(t, empty_defaults):

    """
    In the list 't', replace values that are empty or absent with defaults
    provided by the 'empty_defaults' collection whose entries are of the form
    (index, value).

    The list 't' is modified in place, being extended with None values where an
    index lies beyond its end, and is also returned.
    """

    for index, default in empty_defaults:

        # Pad the list so that the indexed position exists.

        missing = index - len(t) + 1
        if missing > 0:
            t.extend([None] * missing)

        # Any false value (None, the empty string) is considered empty.

        if not t[index]:
            t[index] = default

    return t

def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    From the file having the given 'filename', return a list of tuples
    representing the file's contents. The file is decoded as UTF-8.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    # The context manager closes the file even if parsing raises an exception.

    with codecs.open(filename, "rb", encoding="utf-8") as f:
        return get_table_from_stream(f, empty_defaults, tab_separated)

def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Return a list of tuples representing the contents of the stream 'f', with
    one tuple being produced for each line of the stream.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    rows = []

    # Iterate over the stream directly instead of reading all lines up front.

    for line in f:

        # Remove surrounding spaces and line endings but not tab characters,
        # thus preserving empty leading and trailing columns.

        line = line.strip(" \r\n")

        if tab_separated:
            columns = line.split("\t")
        else:
            columns = parse_line(line)

        if empty_defaults:
            columns = set_defaults(columns, empty_defaults)

        rows.append(tuple(columns))

    return rows

# vim: tabstop=4 expandtab shiftwidth=4