diff --git a/docs/index.rst b/docs/index.rst index 2576077..c4db3c2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -216,7 +216,7 @@ Extract using the rules in a JSON file (from *parslepy*'s ``examples/`` director "url": "a.item @href", }] } - $ python run_parslepy.py --script examples/engadget_css.let.json --url http://www.engadget.com + $ python run_parslepy.py examples/engadget_css.let.json http://www.engadget.com {u'sections': [{u'title': u'News', u'url': '/'}, {u'title': u'Reviews', u'url': '/reviews/'}, {u'title': u'Features', u'url': '/features/'}, @@ -228,6 +228,20 @@ Extract using the rules in a JSON file (from *parslepy*'s ``examples/`` director {u'title': u'Engadget Show', u'url': '/videos/show/'}, {u'title': u'Topics', u'url': '#nav-topics'}]} +You can also pass a local HTML file instead:: + + $ wget http://www.engadget.com -O page.html + $ python run_parslepy.py examples/engadget_css.let.json page.html + {u'sections': [{u'title': u'News', u'url': '/'}, + {u'title': u'Reviews', u'url': '/reviews/'}, + {u'title': u'Features', u'url': '/features/'}, + {u'title': u'Galleries', u'url': '/galleries/'}, + {u'title': u'Videos', u'url': '/videos/'}, + {u'title': u'Events', u'url': '/events/'}, + {u'title': u'Podcasts', + u'url': '/podcasts/the-engadget-podcast/'}, + {u'title': u'Engadget Show', u'url': '/videos/show/'}, + {u'title': u'Topics', u'url': '#nav-topics'}]} You may want to check out the other examples given in the ``examples/`` directory. You can run them using the ``run_parslepy.py`` script like shown above. diff --git a/run_parslepy.py b/run_parslepy.py index 9f55c6e..498e004 100644 --- a/run_parslepy.py +++ b/run_parslepy.py @@ -1,34 +1,24 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import optparse +import argparse import pprint import parslepy -import lxml.html -def main(): - parser = optparse.OptionParser() - parser.add_option("--debug", dest="debug", action="store_true", help="debug mode", default=False) - parser.add_option("--url", dest="url", help="fetch this URL", default=None) - parser.add_option("--file", dest="inputfile", help="parse this HTML file", default=None) - parser.add_option("--script", dest="parselet", help="Parsley script filename", default=None) - - (options, args) = parser.parse_args() +def main(args): + with open(args.parselet) as fp: + extractor = parslepy.Parselet.from_jsonfile(fp, debug=args.debug) + output = extractor.parse(args.input) + pprint.pprint(output) - if not options.parselet: - print("You must provide a Parsley script") - return - if not options.url and not options.inputfile: - print("You must provide an URL") - return - with open(options.parselet) as fp: - - extractor = parslepy.Parselet.from_jsonfile(fp, options.debug) - output = extractor.parse(options.url or options.inputfile) - pprint.pprint(output) +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('parselet', help='Path to Parsley script') + parser.add_argument('input', help='URL or HTML file to scrape from') + parser.add_argument('--debug', action='store_true', help='Enable DEBUG mode') + return parser.parse_args() if __name__ == '__main__': - main() - + main(parse_args()) diff --git a/setup.py b/setup.py index a977d92..ef739fa 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ "lxml >= 2.3", "cssselect", ], + scripts=['run_parslepy.py'], classifiers = [ 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML',