Lichen (annotate pyparser/pyparse.py in 1f9bb1de08f5)

from pyparser import parser, pytokenizer, pygram, error

paul@437

2

from pyparser import consts

def recode_to_utf8(bytes, encoding):
    """Re-encode the byte string <bytes> from <encoding> to UTF-8.

    * bytes: the encoded source text.  (The name shadows the builtin; it is
      kept because it is part of the function's signature.)
    * encoding: name of the codec used to decode <bytes>.

    Returns the text encoded as UTF-8.

    Raises error.SyntaxError if the codec does not produce a unicode object.
    Exceptions raised by decode() itself (LookupError for an unknown codec,
    UnicodeDecodeError for undecodable input) are not handled here and
    propagate to the caller.
    """
    text = bytes.decode(encoding)
    # Guard against codecs whose decode() does not yield unicode text, since
    # only unicode text can be re-encoded below.
    if not isinstance(text, unicode):
        raise error.SyntaxError("codec did not return a unicode object")
    return text.encode("utf-8")

def _normalize_encoding(encoding):

paul@437

12

    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'

paul@437

15

    for implementation details / reference

    NOTE: for now, parser.suite() raises a MemoryError when

paul@437

18

          a bad encoding is used. (SF bug #979739)

paul@437

19

"""

paul@437

20

    if encoding is None:

paul@437

21

        return None

paul@437

22

    # lower() + '_' / '-' conversion

paul@437

23

    encoding = encoding.replace('_', '-').lower()

paul@437

24

    if encoding == 'utf-8' or encoding.startswith('utf-8-'):

paul@437

25

        return 'utf-8'

paul@437

26

    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:

paul@437

27

        if (encoding == variant or

paul@437

28

            encoding.startswith(variant + '-')):

paul@437

29

            return 'iso-8859-1'

paul@437

30

    return encoding

def _check_for_encoding(s):
    """returns the encoding declared in the first two lines of <s>

    Only the first line is examined unless it yields no encoding and
    _check_line_for_encoding() reports that the next line may be consulted;
    in that case the result for the second line is returned.
    """
    first_line, newline, rest = s.partition('\n')
    enc, again = _check_line_for_encoding(first_line)
    # Stop when there is no second line, when a declaration was found, or
    # when the first line rules out looking any further.
    if not newline or enc or not again:
        return enc
    second_line = rest.partition('\n')[0]
    return _check_line_for_encoding(second_line)[0]

def _check_line_for_encoding(line):

paul@437

46

    """returns the declared encoding or None"""

paul@437

47

    i = 0

paul@437

48

    for i in range(len(line)):

paul@437

49

        if line[i] == '#':

paul@437

50

            break

paul@437

51

        if line[i] not in ' \t\014':

paul@437

52

            return None, False  # Not a comment, don't read the second line.

paul@437

53

    return pytokenizer.match_encoding_declaration(line[i:]), True

class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.  Starts out as None and is filled in by
      PythonParser.parse_source() once an encoding has been determined.
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        self.encoding = None
        self.flags = flags

# Maps each parse mode accepted by CompileInfo to the grammar start symbol
# handed to the parser's prepare() call.
_targets = {
    'eval': pygram.syms["eval_input"],
    'single': pygram.syms["single_input"],
    'exec': pygram.syms["file_input"],
}

class PythonParser(parser.Parser):
    """Parser that turns Python source text into a parse tree."""

    def __init__(self, grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.

        * textsrc: the source as a byte string.
        * compile_info: a CompileInfo giving the filename, mode and flags; its
          encoding attribute is set when an encoding is determined.

        Returns the root of the parse tree, wrapped in an encoding_decl node
        when an encoding is known.

        Raises error.SyntaxError (or error.IndentationError) for encoding
        problems and parse errors; error.TokenError from the tokenizer is
        re-raised with the filename filled in.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given check that it is utf-8.
            # The cookie is normalized first so that spellings such as
            # "UTF-8" or "utf_8" are not rejected.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and _normalize_encoding(decl_enc) != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                        filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))
            if enc is not None and enc != 'utf-8':
                try:
                    textsrc = recode_to_utf8(textsrc, enc)
                except LookupError:
                    # if the codec is not found, LookupError is raised.
                    raise error.SyntaxError("Unknown encoding: %s" % enc,
                                            filename=compile_info.filename)
                # Transform unicode errors into SyntaxError
                except UnicodeDecodeError as e:
                    raise error.SyntaxError(str(e),
                                            filename=compile_info.filename)

        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        # Type of the most recent token; consulted when reporting a
        # ParseError below.
        tp = 0
        try:
            try:
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                # which is expected to work independently of them.  It's
                # certainly the case for all futures in Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)

                # NOTE(review): this replaces whatever grammar was passed to
                # __init__ -- confirm that is intended.
                self.grammar = pygram.python_grammar

                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens["INDENT"]:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens["INDENT"]:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
            # Wrap the tree in a special encoding declaration for parser module
            # compatibility.
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
        return tree

def parse(filename):
    """returns the parsed contents of <filename>

    The file is read in full and parsed in the default 'exec' mode; it is
    closed again whether or not parsing succeeds.
    """
    info = CompileInfo(filename)
    with open(filename) as f:
        return PythonParser().parse_source(f.read(), info)

def suite(text):
    """returns the parsed form of the given program <text>

    The text is parsed in the default 'exec' mode and reported under the
    filename "<stdin>".
    """
    info = CompileInfo("<stdin>")
    return PythonParser().parse_source(text, info)

def expr(text):
    """returns the parsed form of the given expression <text>

    The text is reported under the filename "<stdin>".
    """
    # NOTE(review): the mode used is "single", not "eval", although this
    # function is described as parsing an expression -- confirm that
    # single_input is the intended start symbol.
    info = CompileInfo("<stdin>", "single")
    return PythonParser().parse_source(text, info)

def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package

    * tree: a parser.AbstractNonterminal or parser.Terminal node.
    * line_info: if true, each terminal tuple includes its line number.
    * col_info: if true, each terminal tuple includes its column.

    A nonterminal becomes (type, child, child, ...), with the encoding
    appended for a parser.NonterminalEnc node.  A terminal becomes
    (type, value[, lineno][, column]).

    Raises TypeError if <tree> is neither kind of node.
    """
    if isinstance(tree, parser.AbstractNonterminal):
        l = [tree.type]
        for i in range(tree.num_children()):
            # Pass the flags down so that they apply to every terminal in
            # the tree, not only to a terminal given as the root.
            l.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            l.append(tree.encoding)
        return tuple(l)
    elif isinstance(tree, parser.Terminal):
        l = [tree.type, tree.value]
        if line_info:
            l.append(tree.get_lineno())
        if col_info:
            l.append(tree.get_column())
        return tuple(l)
    else:
        raise TypeError(tree)

paul@437	1	from pyparser import parser, pytokenizer, pygram, error
paul@437	2	from pyparser import consts
paul@437	3
paul@537	4	def recode_to_utf8(bytes, encoding):
paul@537	5	text = bytes.decode(encoding)
paul@537	6	if not isinstance(text, unicode):
paul@537	7	raise error.SyntaxError("codec did not return a unicode object")
paul@537	8	recoded = text.encode("utf-8")
paul@537	9	return recoded
paul@537	10
paul@437	11	def _normalize_encoding(encoding):
paul@437	12	"""returns normalized name for <encoding>
paul@437	13
paul@437	14	see dist/src/Parser/tokenizer.c 'get_normal_name()'
paul@437	15	for implementation details / reference
paul@437	16
paul@437	17	NOTE: for now, parser.suite() raises a MemoryError when
paul@437	18	a bad encoding is used. (SF bug #979739)
paul@437	19	"""
paul@437	20	if encoding is None:
paul@437	21	return None
paul@437	22	# lower() + '_' / '-' conversion
paul@437	23	encoding = encoding.replace('_', '-').lower()
paul@437	24	if encoding == 'utf-8' or encoding.startswith('utf-8-'):
paul@437	25	return 'utf-8'
paul@437	26	for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
paul@437	27	if (encoding == variant or
paul@437	28	encoding.startswith(variant + '-')):
paul@437	29	return 'iso-8859-1'
paul@437	30	return encoding
paul@437	31
paul@437	32	def _check_for_encoding(s):
paul@437	33	eol = s.find('\n')
paul@437	34	if eol < 0:
paul@437	35	return _check_line_for_encoding(s)[0]
paul@437	36	enc, again = _check_line_for_encoding(s[:eol])
paul@437	37	if enc or not again:
paul@437	38	return enc
paul@437	39	eol2 = s.find('\n', eol + 1)
paul@437	40	if eol2 < 0:
paul@437	41	return _check_line_for_encoding(s[eol + 1:])[0]
paul@437	42	return _check_line_for_encoding(s[eol + 1:eol2])[0]
paul@437	43
paul@437	44
paul@437	45	def _check_line_for_encoding(line):
paul@437	46	"""returns the declared encoding or None"""
paul@437	47	i = 0
paul@437	48	for i in range(len(line)):
paul@437	49	if line[i] == '#':
paul@437	50	break
paul@437	51	if line[i] not in ' \t\014':
paul@437	52	return None, False # Not a comment, don't read the second line.
paul@437	53	return pytokenizer.match_encoding_declaration(line[i:]), True
paul@437	54
paul@437	55
paul@437	56	class CompileInfo(object):
paul@437	57	"""Stores information about the source being compiled.
paul@437	58
paul@437	59	* filename: The filename of the source.
paul@437	60	* mode: The parse mode to use. ('exec', 'eval', or 'single')
paul@437	61	* flags: Parser and compiler flags.
paul@437	62	* encoding: The source encoding.
paul@437	63	"""
paul@437	64
paul@437	65	def __init__(self, filename, mode="exec", flags=0):
paul@437	66	self.filename = filename
paul@437	67	self.mode = mode
paul@437	68	self.encoding = None
paul@437	69	self.flags = flags
paul@437	70
paul@437	71
paul@437	72	_targets = {
paul@439	73	'eval' : pygram.syms["eval_input"],
paul@439	74	'single' : pygram.syms["single_input"],
paul@439	75	'exec' : pygram.syms["file_input"],
paul@437	76	}
paul@437	77
paul@437	78	class PythonParser(parser.Parser):
paul@437	79
paul@437	80	def __init__(self, grammar=pygram.python_grammar):
paul@437	81	parser.Parser.__init__(self, grammar)
paul@437	82
paul@437	83	def parse_source(self, textsrc, compile_info):
paul@437	84	"""Main entry point for parsing Python source.
paul@437	85
paul@437	86	Everything from decoding the source to tokenizing to building the parse
paul@437	87	tree is handled here.
paul@437	88	"""
paul@437	89	# Detect source encoding.
paul@437	90	enc = None
paul@437	91	if textsrc.startswith("\xEF\xBB\xBF"):
paul@437	92	textsrc = textsrc[3:]
paul@437	93	enc = 'utf-8'
paul@437	94	# If an encoding is explicitly given check that it is utf-8.
paul@437	95	decl_enc = _check_for_encoding(textsrc)
paul@437	96	if decl_enc and decl_enc != "utf-8":
paul@437	97	raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
paul@437	98	filename=compile_info.filename)
paul@437	99	elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
paul@437	100	enc = 'utf-8'
paul@437	101	if _check_for_encoding(textsrc) is not None:
paul@437	102	raise error.SyntaxError("coding declaration in unicode string",
paul@437	103	filename=compile_info.filename)
paul@437	104	else:
paul@437	105	enc = _normalize_encoding(_check_for_encoding(textsrc))
paul@537	106	if enc is not None and enc != 'utf-8':
paul@537	107	try:
paul@537	108	textsrc = recode_to_utf8(textsrc, enc)
paul@537	109	except LookupError as e:
paul@537	110	# if the codec is not found, LookupError is raised.
paul@537	111	raise error.SyntaxError("Unknown encoding: %s" % enc,
paul@537	112	filename=compile_info.filename)
paul@537	113	# Transform unicode errors into SyntaxError
paul@537	114	except UnicodeDecodeError as e:
paul@537	115	message = str(e)
paul@537	116	raise error.SyntaxError(message)
paul@437	117
paul@437	118	flags = compile_info.flags
paul@437	119
paul@437	120	# The tokenizer is very picky about how it wants its input.
paul@437	121	source_lines = textsrc.splitlines(True)
paul@437	122	if source_lines and not source_lines[-1].endswith("\n"):
paul@437	123	source_lines[-1] += '\n'
paul@437	124	if textsrc and textsrc[-1] == "\n":
paul@437	125	flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
paul@437	126
paul@437	127	self.prepare(_targets[compile_info.mode])
paul@437	128	tp = 0
paul@437	129	try:
paul@437	130	try:
paul@437	131	# Note: we no longer pass the CO_FUTURE_* to the tokenizer,
paul@437	132	# which is expected to work independently of them. It's
paul@437	133	# certainly the case for all futures in Python <= 2.7.
paul@437	134	tokens = pytokenizer.generate_tokens(source_lines, flags)
paul@437	135
paul@437	136	self.grammar = pygram.python_grammar
paul@437	137
paul@437	138	for tp, value, lineno, column, line in tokens:
paul@437	139	if self.add_token(tp, value, lineno, column, line):
paul@437	140	break
paul@437	141	except error.TokenError as e:
paul@437	142	e.filename = compile_info.filename
paul@437	143	raise
paul@437	144	except parser.ParseError as e:
paul@437	145	# Catch parse errors, pretty them up and reraise them as a
paul@437	146	# SyntaxError.
paul@437	147	new_err = error.IndentationError
paul@439	148	if tp == pygram.tokens["INDENT"]:
paul@437	149	msg = "unexpected indent"
paul@439	150	elif e.expected == pygram.tokens["INDENT"]:
paul@437	151	msg = "expected an indented block"
paul@437	152	else:
paul@437	153	new_err = error.SyntaxError
paul@437	154	msg = "invalid syntax"
paul@437	155	raise new_err(msg, e.lineno, e.column, e.line,
paul@437	156	compile_info.filename)
paul@437	157	else:
paul@437	158	tree = self.root
paul@437	159	finally:
paul@437	160	# Avoid hanging onto the tree.
paul@437	161	self.root = None
paul@437	162	if enc is not None:
paul@437	163	compile_info.encoding = enc
paul@438	164	# Wrap the tree in a special encoding declaration for parser module
paul@438	165	# compatibility.
paul@439	166	tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
paul@437	167	return tree
paul@437	168
paul@437	169	def parse(filename):
paul@437	170	"""returns the parsed contents of <filename>"""
paul@437	171	info = CompileInfo(filename)
paul@437	172	f = open(filename)
paul@437	173	try:
paul@437	174	return PythonParser().parse_source(f.read(), info)
paul@437	175	finally:
paul@437	176	f.close()
paul@437	177
paul@437	178	def suite(text):
paul@437	179	"""returns the parsed form of the given program <text>"""
paul@437	180	info = CompileInfo("<stdin>")
paul@437	181	return PythonParser().parse_source(text, info)
paul@437	182
paul@437	183	def expr(text):
paul@437	184	"""returns the parsed form of the given expression <text>"""
paul@437	185	info = CompileInfo("<stdin>", "single")
paul@437	186	return PythonParser().parse_source(text, info)
paul@437	187
paul@437	188	def st2tuple(tree, line_info=True, col_info=False):
paul@437	189	"""returns <tree> in tuple form for the compiler package"""
paul@437	190	if isinstance(tree, parser.AbstractNonterminal):
paul@437	191	l = [tree.type]
paul@437	192	for i in range(0, tree.num_children()):
paul@437	193	l.append(st2tuple(tree.get_child(i)))
paul@438	194	if isinstance(tree, parser.NonterminalEnc):
paul@438	195	l.append(tree.encoding)
paul@437	196	return tuple(l)
paul@437	197	elif isinstance(tree, parser.Terminal):
paul@437	198	l = [tree.type, tree.value]
paul@437	199	if line_info:
paul@437	200	l.append(tree.get_lineno())
paul@437	201	if col_info:
paul@437	202	l.append(tree.get_column())
paul@437	203	return tuple(l)
paul@437	204	else:
paul@437	205	raise TypeError, tree

Lichen

Annotated pyparser/pyparse.py