#!/usr/bin/env python

"""
Parsing of textual content.

Copyright (C) 2014, 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import codecs
import re

# Parsing of lines to obtain functions and arguments.

# A single component of a word: exactly one of the three groups participates
# in any given match.

component_pattern_str = (
    r"(?:'(.*?)')"      # single-quoted text
    r"|"
    r'(?:"(.*?)")'      # double-quoted text
    r"|"
    r"([^\s]+)"         # non-whitespace characters
    )

component_pattern = re.compile(component_pattern_str)

# A complete word: one or more adjacent components terminated by whitespace or
# by the end of the text.

line_pattern_str = (
    r"(?:" +
    component_pattern_str +
    r")+"
    r"(?:\s+|$)"        # whitespace or the end of the text ending the word
    )

line_pattern = re.compile(line_pattern_str)

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes. Quotes are only recognised at the start of a word or
    directly after another quoted region; elsewhere they are ordinary
    characters. Adjacent components of a word are concatenated in the order in
    which they appear.

    Hello world    -> ['Hello', 'world']
    Hello ' world' -> ['Hello', ' world']
    Hello' 'world  -> ["Hello'", "'world"]
    'a b'"c d"     -> ['a bc d']
    """

    parts = []

    # Match each whitespace-delimited word.

    for match in line_pattern.finditer(text):

        # Capturing groups inside a repeated group only retain their last
        # match, so the word is scanned again, component by component, to
        # collect every component in its original order.

        parts.append("".join(
            component.group(component.lastindex)
            for component in component_pattern.finditer(match.group())
            ))

    return parts

# Parsing of tabular files.
def set_defaults(t, empty_defaults):

    """
    Fill in the list 't' using 'empty_defaults', a collection of (index, value)
    entries: wherever the item at an index is absent or has a false value (such
    as an empty string), the corresponding value is stored at that index. The
    list is extended as necessary, modified in place and returned.
    """

    for index, value in empty_defaults:

        # Pad the list so that the indexed position exists.

        shortfall = index + 1 - len(t)
        if shortfall > 0:
            t += [None] * shortfall

        # Only positions lacking a usable value receive the default.

        if t[index]:
            continue
        t[index] = value

    return t

def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    Read the UTF-8 encoded file with the given 'filename' and return its
    contents as a list of tuples, one per line.

    Any 'empty_defaults' given is a list of (index, value) tuples supplying
    the value to use for a column that is missing from a line or that is
    empty.

    Lines are split at tab characters unless 'tab_separated' is given as a
    false value, in which case the imiptools.text.parse_line function is used
    to split each line instead.
    """

    with codecs.open(filename, "rb", encoding="utf-8") as stream:
        return get_table_from_stream(stream, empty_defaults, tab_separated)

def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Read all lines from the stream 'f' and return them as a list of tuples,
    one per line.

    Any 'empty_defaults' given is a list of (index, value) tuples supplying
    the value to use for a column that is missing from a line or that is
    empty.

    Lines are split at tab characters unless 'tab_separated' is given as a
    false value, in which case the imiptools.text.parse_line function is used
    to split each line instead.
    """

    rows = []

    for raw in f.readlines():

        # Discard surrounding spaces and line endings, but not tabs, since
        # these may delimit empty columns.

        content = raw.strip(" \r\n")
        fields = content.split("\t") if tab_separated else parse_line(content)

        if empty_defaults:
            fields = set_defaults(fields, empty_defaults)

        rows.append(tuple(fields))

    return rows

# vim: tabstop=4 expandtab shiftwidth=4