(16 ratings)   
By: David Kuhlman
Python is an excellent language for text analysis.In some cases, simply splitting lines of text into words will be enough. In these cases use string.split(). In other cases, regular expressions may be able to do the parsing you need. If so, see the...
Added: 02 June 2008    Views: 107  
PathComputers    Programming    Python
Keywords: computer   programming   python   code   coder   language   coding   parsing   plex  
Do you like this tutorial? Now you can support our team to add more :     
 
 
 
Browse Pages :   <<    <    1    2    3  

Creating a parser with PLY

In this section we will show how to implement our parser example with PLY.

First down-load PLY. It is available at http://systems.cs.uchicago.edu/ply/.

Then add the PLY directory to your PYTHONPATH.

Learn how to construct lexers and parsers with PLY by reading doc/ply.html in the distribution of PLY and by looking at the examples in the distribution.

For those of you who want a more complex example, see A Python Parser for the RELAX NG Compact Syntax, which is implemented with PLY.

Now, here is our example parser. Comments and explanations are below.

#!/usr/bin/env python

"""

python_201_parser_ply.py



A parser example.

This example uses PLY to implement a lexer and parser.



The grammar:



Prog ::= Command*

Command ::= Func_call

Func_call ::= Term '(' Func_call_list ')'

Func_call_list ::= Func_call*

Term = <word>



"""



import sys, types

import getopt

import lex

import yacc



#

# Globals

#



startlinepos = 0



#

# Constants

#



# AST node types

NoneNodeType = 0

ProgNodeType = 1

CommandNodeType = 2

CommandListNodeType = 3

FuncCallNodeType = 4

FuncCallListNodeType = 5

TermNodeType = 6



# Dictionary to map node type values to node type names

NodeTypeDict = {

NoneNodeType: 'NoneNodeType',

ProgNodeType: 'ProgNodeType',

CommandNodeType: 'CommandNodeType',

CommandListNodeType: 'CommandListNodeType',

FuncCallNodeType: 'FuncCallNodeType',

FuncCallListNodeType: 'FuncCallListNodeType',

TermNodeType: 'TermNodeType',

}



#

# Representation of a node in the AST (abstract syntax tree).

#

class ASTNode:

def __init__(self, nodeType, *args):

self.nodeType = nodeType

self.children = []

for item in args:

self.children.append(item)

def append(self, item):

self.children.append(item)

def show(self, level):

self.showLevel(level)

print 'Node -- Type: %s' % NodeTypeDict[self.nodeType]

level += 1

for child in self.children:

if isinstance(child, ASTNode):

child.show(level)

elif type(child) == types.ListType:

for item in child:

item.show(level)

else:

self.showLevel(level)

print 'Value:', child

def showLevel(self, level):

for idx in range(level):

print ' ',



#

# Exception classes

#

class LexerError(Exception):

def __init__(self, msg, lineno, columnno):

self.msg = msg

self.lineno = lineno

self.columnno = columnno

def show(self):

sys.stderr.write('Lexer error (%d, %d) %s\n' % \

(self.lineno, self.columnno, self.msg))



class ParserError(Exception):

def __init__(self, msg, lineno, columnno):

self.msg = msg

self.lineno = lineno

self.columnno = columnno

def show(self):

sys.stderr.write('Parser error (%d, %d) %s\n' % \

(self.lineno, self.columnno, self.msg))



#

# Lexer specification

#

tokens = (

'NAME',

'LPAR','RPAR',

'COMMA',

)



# Tokens



t_LPAR = r'\('

t_RPAR = r'\)'

t_COMMA = r'\,'

t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'



# Ignore whitespace

t_ignore = ' \t'



# Ignore comments ('#' to end of line)

def t_COMMENT(t):

r'\#[^\n]*'

pass



def t_newline(t):

r'\n+'

global startlinepos

startlinepos = t.lexer.lexpos - 1

t.lineno += t.value.count("\n")



def t_error(t):

global startlinepos

msg = "Illegal character '%s'" % (t.value[0])

columnno = t.lexer.lexpos - startlinepos

raise LexerError(msg, t.lineno, columnno)



#

# Parser specification

#

def p_prog(t):

'prog : command_list'

t[0] = ASTNode(ProgNodeType, t[1])



def p_command_list_1(t):

'command_list : command'

t[0] = ASTNode(CommandListNodeType, t[1])



def p_command_list_2(t):

'command_list : command_list command'

t[1].append(t[2])

t[0] = t[1]



def p_command(t):

'command : func_call'

t[0] = ASTNode(CommandNodeType, t[1])



def p_func_call_1(t):

'func_call : term LPAR RPAR'

t[0] = ASTNode(FuncCallNodeType, t[1])



def p_func_call_2(t):

'func_call : term LPAR func_call_list RPAR'

t[0] = ASTNode(FuncCallNodeType, t[1], t[3])



def p_func_call_list_1(t):

'func_call_list : func_call'

t[0] = ASTNode(FuncCallListNodeType, t[1])



def p_func_call_list_2(t):

'func_call_list : func_call_list COMMA func_call'

t[1].append(t[3])

t[0] = t[1]



def p_term(t):

'term : NAME'

t[0] = ASTNode(TermNodeType, t[1])



def p_error(t):

global startlinepos

msg = "Syntax error at '%s'" % t.value

columnno = t.lexer.lexpos - startlinepos

raise ParserError(msg, t.lineno, columnno)



#

# Parse the input and display the AST (abstract syntax tree)

#

def parse(infileName):

startlinepos = 0

# Build the lexer

lex.lex(debug=1)

# Build the parser

yacc.yacc()

# Read the input

infile = file(infileName, 'r')

content = infile.read()

infile.close()

try:

# Do the parse

result = yacc.parse(content)

# Display the AST

result.show(0)

except LexerError, exp:

exp.show()

except ParserError, exp:

exp.show()



USAGE_TEXT = """

Usage:

python python_201_parser_ply.py [options] <inputfile>

Options:

-h, --help Display this help message.

Example:

python python_201_parser_ply.py testfile.prog

"""



def usage():

print USAGE_TEXT

sys.exit(-1)



def main():

args = sys.argv[1:]

try:

opts, args = getopt.getopt(args, 'h', ['help'])

except:

usage()

relink = 1

for opt, val in opts:

if opt in ('-h', '--help'):

usage()

if len(args) != 1:

usage()

infileName = args[0]

parse(infileName)



if __name__ == '__main__':

main()

#import pdb

#pdb.run('main()')

Applying this parser to the following input:

# Test for recursive descent parser and Plex.

# Command #1

aaa()

# Command #2

bbb (ccc()) # An end of line comment.

# Command #3

ddd(eee(), fff(ggg(), hhh(), iii()))

# End of test

produces the following output:

Node -- Type: ProgNodeType

Node -- Type: CommandListNodeType

Node -- Type: CommandNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: aaa

Node -- Type: CommandNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: bbb

Node -- Type: FuncCallListNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: ccc

Node -- Type: CommandNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: ddd

Node -- Type: FuncCallListNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: eee

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: fff

Node -- Type: FuncCallListNodeType

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: ggg

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: hhh

Node -- Type: FuncCallNodeType

Node -- Type: TermNodeType

Value: iii

Comments and explanation:

  • Creating the syntax tree -- Basically, each rule (1) recognizes a non-terminal, (2) creates a node (possibly using the values from the right-hand side of the rule), and (3) returns the node by setting the value of t[0]. A deviation from this is the processing of sequences, discussed below.
  • Sequences -- p_command_list_1 and p_command_list_1 show how to handle sequences of items. In this case:
    1. p_command_list_1 recognizes a command and creates an instance of ASTNodeCommandListNodeType and adds the command to it as a child, and with type
    2. p_command_list_2 recognizes an additional command and adds it (as a child) to the instance of ASTNode that represents the list.
  • Distinguishing between different forms of the same rule -- In order to process alternatives to the same production rule differently, we use different functions with different implementations. For example, we use:
    • p_func_call_1 to recognize and process "func_call : term LPAR RPAR" (a function call without arguments), and
    • p_func_call_2 to recognize and process "func_call : term LPAR func_call_list RPAR" (a function call with arguments).
  • Reporting errors -- Our parser reports the first error and quits. We've done this by raising an exception when we find an error. We implement two exception classes: LexerError and ParserError. Implementing more than one exception class enables us to distinguish between different classes of errors (note the multiple except: clauses on the try:parse). And, we use an instance of the exception class as a container in order to "bubble up" information about the error (e.g. a message, a line number, and a column number). statement in function

Creating a parser with pyparsing

pyparsing is a relatively new parsing package for Python. It was implemented and is supported by Paul McGuire and it shows promise. It appears especially easy to use and seems especially appropriate in particular for quick parsing tasks, although it has features that make some complex parsing tasks easy. It follows a very natural Python style for constructing parsers.

Good documentation comes with the pyparsing distribution. See file HowToUseParsing.html. So, I won't try to repeat that here. What follows is an attempt to provide several quick examples to help you solve simple parsing tasks as quickly as possible.

You will also want to look at the samples in the examples directory, which are very helpful. My examples below are fairly simple. You can see more of the ability of pyparsing to handle complex tasks in the examples.

Where to get it - You can find pyparsing at: http://pyparsing.sourceforge.net/.

How to install it - Put the pyparsing module somewhere on your PYTHONPATH.

And now, here are a few examples.

Parsing comma-delimeted lines

Here is a simple grammar for lines containing fields separated by commas:

import sys

from pyparsing import alphanums, ZeroOrMore, Word



fieldDef = Word(alphanums)

lineDef = fieldDef + ZeroOrMore("," + fieldDef)



args = sys.argv[1:]

if len(args) != 1:

print 'usage: python pyparsing_test1.py <datafile.txt>'

sys.exit(-1)

infilename = sys.argv[1]

infile = file(infilename, 'r')

for line in infile:

fields = lineDef.parseString(line)

print fields

Notes and explanation:

  • Note how the grammar is constructed from normal Python calls to function and object/class constructors. I've constructed the parser in-line because my examples are simple, but constructing the parser in a function or even a module might make sense for more complex grammars. pyparsing makes it easy to use these these different styles.
  • Use "+" to specify a sequence. In our example, a lineDef is a fieldDef followed by ....
  • Use ZeroOrMore to specify repetition. In our example, a lineDef is a fieldDef followed by zero or more occurances of comma and fieldDef. There is also OneOrMore when you want to require at least one occurance.
  • Parsing comma delimited text happens so frequently that pyparsing provides a shortcut. Replace:
    lineDef = fieldDef + ZeroOrMore("," + fieldDef)

    with:

    lineDef = delimitedList(fieldDef)

    And note that delimitedList takes an optional argument delim used to specify the delimiter. The default is a comma.

Parsing functors

This example parses expressions of the form ``func(arg1, arg2, arg3)''.

from pyparsing import Word, alphas, alphanums, nums, ZeroOrMore, Literal



lparen = Literal("(")

rparen = Literal(")")

identifier = Word(alphas, alphanums + "_")

integer = Word( nums )

functor = identifier

arg = identifier | integer

args = arg + ZeroOrMore("," + arg)

expression = functor + lparen + args + rparen



content = raw_input("Enter an expression: ")

parsedContent = expression.parseString(content)

print parsedContent

Explanation:

  • Use Literal to specify a fixed string that is to be matched exactly. In our example, a lparen is a ``(``.
  • Word takes an optional second argument. With a single (string) argument, it matches any contiguous word made up of characters in the string. With two (string) arguments it matches a word whose first character is in the first string and whose remaining characters are in the second string. So, our definition of identifier matches a word whose first character is an alpha and whose remaining characters are alpha-numerics or underscore. As another example, you can think of Word("0123456789") as analogous to a regexp containing the pattern "[0-9]+".
  • Use a vertical bar for alternation. In our example, an arg can be either an identifier or an integer.

Parsing names, phone numbers, etc.

This example parses expressions having the following form:

Input format:



[name] [phone] [city, state zip]



Last, first 111-222-3333 city, ca 99999

Here is the parser:

import sys

from pyparsing import alphas, nums, ZeroOrMore, Word, Group, Suppress, Combine



lastname = Word(alphas)

firstname = Word(alphas)

city = Group(Word(alphas) + ZeroOrMore(Word(alphas)))

state = Word(alphas, exact=2)

zip = Word(nums, exact=5)



name = Group(lastname + Suppress(",") + firstname)

phone = Combine(Word(nums, exact=3) + "-" + Word(nums, exact=3) + "-" + Word(nums, exact=4))

location = Group(city + Suppress(",") + state + zip)



record = name + phone + location



args = sys.argv[1:]

if len(args) != 1:

print 'usage: python pyparsing_test3.py <datafile.txt>'

sys.exit(-1)

infilename = sys.argv[1]

infile = file(infilename, 'r')

for line in infile:

line = line.strip()

if line and line[0] != "#":

fields = record.parseString(line)

print fields

And, here is some sample input:

Jabberer, Jerry          111-222-3333   Bakersfield, CA 95111

Kackler, Kerry 111-222-3334 Fresno, CA 95112

Louderdale, Larry 111-222-3335 Los Angeles, CA 94001

Here is output from parsing the above input:

[['Jabberer', 'Jerry'], '111-222-3333', [['Bakersfield'], 'CA', '95111']]

[['Kackler', 'Kerry'], '111-222-3334', [['Fresno'], 'CA', '95112']]

[['Louderdale', 'Larry'], '111-222-3335', [['Los', 'Angeles'], 'CA', '94001']]

Comments:

  • We use the ``len=n'' argument to the Word constructor to restict the parser to accepting a specific number of characters, for example in the zip code and phone number. Word also accepts ``min=n'' and ``max=n'' to enable you to restrict the length of a word to within a range.
  • We use Group to group the parsed results into sub-lists, for example in the definition of city and name. Group enables us to organize the parse results into simple parse trees.
  • We use Combine to join parsed results back into a single string. For example, in the phone number, we can require dashes and yet join the results back into a single string.
  • We use Suppress to remove unneeded sub-elements from parsed results. For example, we do not need the comma between last and first name.

A more complex example

This example (thanks to Paul McGuire) parses a more complex structure and produces a dictionary.

Here is the code:

from pyparsing import Literal, Word, Group, Dict, ZeroOrMore, alphas, nums,\

delimitedList



import pprint



testData = """

+-------+------+------+------+------+------+------+------+------+

| | A1 | B1 | C1 | D1 | A2 | B2 | C2 | D2 |

+=======+======+======+======+======+======+======+======+======+

| min | 7 | 43 | 7 | 15 | 82 | 98 | 1 | 37 |

| max | 11 | 52 | 10 | 17 | 85 | 112 | 4 | 39 |

| ave | 9 | 47 | 8 | 16 | 84 | 106 | 3 | 38 |

| sdev | 1 | 3 | 1 | 1 | 1 | 3 | 1 | 1 |

+-------+------+------+------+------+------+------+------+------+

"""



# Define grammar for datatable

heading = (Literal(

"+-------+------+------+------+------+------+------+------+------+") +

"| | A1 | B1 | C1 | D1 | A2 | B2 | C2 | D2 |" +

"+=======+======+======+======+======+======+======+======+======+").suppress()



vert = Literal("|").suppress()

number = Word(nums)

rowData = Group( vert + Word(alphas) + vert + delimitedList(number,"|") +

vert )

trailing = Literal(

"+-------+------+------+------+------+------+------+------+------+").suppress()



datatable = heading + Dict( ZeroOrMore(rowData) ) + trailing



# Now parse data and print results

data = datatable.parseString(testData)

print "data:", data

print "data.asList():",

pprint.pprint(data.asList())

print "data keys:", data.keys()

print "data['min']:", data['min']

print "data.max:", data.max

Notes:

  • Note the use of Dict to create a dictionary. The print statements show how to get at the items in the dictionary.
  • Note how we can also get the parse results as a list by using method asList.
  • Again, we use suppress to remove unneeded items from the parse results.
Browse Pages :   <<    <    1    2    3  
About the Author :
Parsing in Phyton
Articles and Tutorials Directory by www.learnfobia.com
 Rate this tutorial : Rate 1Rate 2Rate 3Rate 4Rate 5
  |    Add to Favorites
  |    Send to Friend
  |    Print
Comments