Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ Extract using the rules in a JSON file (from *parslepy*'s ``examples/`` director
"url": "a.item @href",
}]
}
$ python run_parslepy.py --script examples/engadget_css.let.json --url http://www.engadget.com
$ python run_parslepy.py examples/engadget_css.let.json http://www.engadget.com
{u'sections': [{u'title': u'News', u'url': '/'},
{u'title': u'Reviews', u'url': '/reviews/'},
{u'title': u'Features', u'url': '/features/'},
Expand All @@ -228,6 +228,20 @@ Extract using the rules in a JSON file (from *parslepy*'s ``examples/`` director
{u'title': u'Engadget Show', u'url': '/videos/show/'},
{u'title': u'Topics', u'url': '#nav-topics'}]}

You can also pass a local HTML file instead::

$ wget http://www.engadget.com -O page.html
$ python run_parslepy.py examples/engadget_css.let.json page.html
{u'sections': [{u'title': u'News', u'url': '/'},
{u'title': u'Reviews', u'url': '/reviews/'},
{u'title': u'Features', u'url': '/features/'},
{u'title': u'Galleries', u'url': '/galleries/'},
{u'title': u'Videos', u'url': '/videos/'},
{u'title': u'Events', u'url': '/events/'},
{u'title': u'Podcasts',
u'url': '/podcasts/the-engadget-podcast/'},
{u'title': u'Engadget Show', u'url': '/videos/show/'},
{u'title': u'Topics', u'url': '#nav-topics'}]}

You may want to check out the other examples given in the ``examples/`` directory.
You can run them using the ``run_parslepy.py`` script as shown above.
Expand Down
36 changes: 13 additions & 23 deletions run_parslepy.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import optparse
import argparse
import pprint
import parslepy
import lxml.html

def main():

parser = optparse.OptionParser()
parser.add_option("--debug", dest="debug", action="store_true", help="debug mode", default=False)
parser.add_option("--url", dest="url", help="fetch this URL", default=None)
parser.add_option("--file", dest="inputfile", help="parse this HTML file", default=None)
parser.add_option("--script", dest="parselet", help="Parsley script filename", default=None)

(options, args) = parser.parse_args()
def main(args):
    """Run a Parsley script against an input and pretty-print the result.

    Args:
        args: Parsed command-line namespace. It must provide ``parselet``
            (path to the Parsley JSON script), ``input`` (the URL or HTML
            file handed to the extractor) and ``debug`` (flag forwarded to
            the parselet loader).
    """
    # The script file is only needed while the parselet is being built, so
    # the handle is released before the (possibly slow) extraction runs.
    # NOTE(review): assumes from_jsonfile() reads the file eagerly -- confirm.
    with open(args.parselet) as fp:
        extractor = parslepy.Parselet.from_jsonfile(fp, debug=args.debug)
    output = extractor.parse(args.input)
    pprint.pprint(output)

if not options.parselet:
print("You must provide a Parsley script")
return
if not options.url and not options.inputfile:
print("You must provide an URL")
return

with open(options.parselet) as fp:

extractor = parslepy.Parselet.from_jsonfile(fp, options.debug)
output = extractor.parse(options.url or options.inputfile)
pprint.pprint(output)
def parse_args(argv=None):
    """Parse the command line of the ``run_parslepy.py`` script.

    Args:
        argv: Optional list of argument strings (without the program name).
            When ``None`` (the default), argparse reads ``sys.argv[1:]``,
            which is the behavior existing callers rely on.

    Returns:
        An ``argparse.Namespace`` with ``parselet`` (path to the Parsley
        script), ``input`` (URL or HTML file to scrape from) and ``debug``
        (``True`` when ``--debug`` was given, ``False`` otherwise).
    """
    # Fall back to a fixed description when the module has no docstring, so
    # that ``--help`` still says what the script does.
    description = __doc__ or 'Extract data from a URL or HTML file using a Parsley script'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('parselet', help='Path to Parsley script')
    parser.add_argument('input', help='URL or HTML file to scrape from')
    parser.add_argument('--debug', action='store_true', help='Enable DEBUG mode')
    return parser.parse_args(argv)

if __name__ == '__main__':
main()

main(parse_args())
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"lxml >= 2.3",
"cssselect",
],
scripts=['run_parslepy.py'],
classifiers = [
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Markup :: HTML',
Expand Down