diff --git a/dumpgenerator.py b/dumpgenerator.py index 36c2a0d9..bb78d250 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python # -*- coding: utf-8 -*- # dumpgenerator.py A generator of dumps for wikis @@ -19,18 +19,21 @@ # To learn more, read the documentation: # https://github.com/WikiTeam/wikiteam/wiki +from __future__ import print_function try: from kitchen.text.converters import getwriter, to_unicode except ImportError: - print "Please install the kitchen module." -import cookielib -import cPickle + print ("Please install the kitchen module.") +try: + import cPickle as pickle +except ImportError: + import pickle import datetime import sys try: import argparse except ImportError: - print "Please install the argparse module." + print ("Please install the argparse module.") sys.exit(1) import json try: @@ -40,28 +43,38 @@ import os import re import subprocess +import sys try: import requests except ImportError: - print "Please install or update the Requests module." + print ("Please install or update the Requests module.") sys.exit(1) try: import wikitools except ImportError: - print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions." + print ("Please install the wikitools 1.3+ module if you want to use --xmlrevisions.") try: from lxml import etree from lxml.builder import E except ImportError: - print "Please install the lxml module if you want to use --xmlrevisions." + print ("Please install the lxml module if you want to use --xmlrevisions.") import time import urllib try: from urlparse import urlparse, urlunparse except ImportError: from urllib.parse import urlparse, urlunparse -UTF8Writer = getwriter('utf8') -sys.stdout = UTF8Writer(sys.stdout) +try: + import http.cookiejar as CookieJar +except ImportError: + import cookielib as CookieJar +if sys.version_info < (3, 0): + UTF8Writer = getwriter('utf8') + sys.stdout = UTF8Writer(sys.stdout) + input = raw_input + from urllib import unquote +else: + from urllib.parse import unquote __VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org @@ -91,7 +104,7 @@ def truncateFilename(other={}, filename=''): def delay(config={}, session=None): """ Add a delay if configured for that """ if config['delay'] > 0: - print 'Sleeping... %d seconds...' % (config['delay']) + print ('Sleeping... %d seconds...' % (config['delay'])) time.sleep(config['delay']) @@ -118,8 +131,8 @@ def cleanHTML(raw=''): raw = raw.split('')[0] else: - print raw[:250] - print 'This wiki doesn\'t use marks to split content' + print (raw[:250]) + print ('This wiki doesn\'t use marks to split content') sys.exit() return raw @@ -129,31 +142,31 @@ def handleStatusCode(response): if statuscode >= 200 and statuscode < 300: return - print "HTTP Error %d." % statuscode + print ("HTTP Error %d." % statuscode) if statuscode >= 300 and statuscode < 400: - print "Redirect should happen automatically: please report this as a bug." - print response.url + print ("Redirect should happen automatically: please report this as a bug.") + print (response.url) elif statuscode == 400: - print "Bad Request: The wiki may be malfunctioning." - print "Please try again later." - print response.url + print ("Bad Request: The wiki may be malfunctioning.") + print ("Please try again later.") + print (response.url) sys.exit(1) elif statuscode == 401 or statuscode == 403: - print "Authentication required." - print "Please use --userpass." - print response.url + print ("Authentication required.") + print ("Please use --userpass.") + print (response.url) elif statuscode == 404: - print "Not found. Is Special:Export enabled for this wiki?" - print response.url + print ("Not found. Is Special:Export enabled for this wiki?") + print (response.url) sys.exit(1) elif statuscode == 429 or (statuscode >= 500 and statuscode < 600): - print "Server error, max retries exceeded." - print "Please resume the dump later." - print response.url + print ("Server error, max retries exceeded.") + print ("Please resume the dump later.") + print (response.url) sys.exit(1) @@ -190,7 +203,7 @@ def getNamespacesScraper(config={}, session=None): namespaces = [0] namespaces = list(set(namespaces)) # uniques - print '%d namespaces found' % (len(namespaces)) + print ('%d namespaces found' % (len(namespaces))) return namespaces, namespacenames @@ -213,9 +226,9 @@ def getNamespacesAPI(config={}, session=None): try: nsquery = result['query']['namespaces'] except KeyError: - print "Error: could not get namespaces from the API request" - print "HTTP %d" % r.status_code - print r.text + print ("Error: could not get namespaces from the API request") + print ("HTTP %d" % r.status_code) + print (r.text) return None if 'all' in namespaces: @@ -241,7 +254,7 @@ def getNamespacesAPI(config={}, session=None): namespaces = [0] namespaces = list(set(namespaces)) # uniques - print '%d namespaces found' % (len(namespaces)) + print ('%d namespaces found' % (len(namespaces))) return namespaces, namespacenames @@ -252,11 +265,11 @@ def getPageTitlesAPI(config={}, session=None): config=config, session=session) for namespace in namespaces: if namespace in config['exnamespaces']: - print ' Skipping namespace = %d' % (namespace) + print (' Skipping namespace = %d' % (namespace)) continue c = 0 - print ' Retrieving titles in the namespace %d' % (namespace) + print (' Retrieving titles in the namespace %d' % (namespace)) apfrom = '!' while apfrom: sys.stderr.write('.') # progress @@ -274,7 +287,7 @@ def getPageTitlesAPI(config={}, session=None): r = session.post(url=config['api'], data=params, timeout=30) break except ConnectionError as err: - print "Connection error: %s" % (str(err),) + print ("Connection error: %s" % (str(err),)) retryCount += 1 time.sleep(20) handleStatusCode(r) @@ -294,12 +307,12 @@ def getPageTitlesAPI(config={}, session=None): elif 'apfrom' in jsontitles['continue']: apfrom = jsontitles['continue']['apfrom'] - # print apfrom - # print jsontitles + # print (apfrom) + # print (jsontitles) try: allpages = jsontitles['query']['allpages'] except KeyError: - print "The allpages API returned nothing. Exit." + print ("The allpages API returned nothing. Exit.") sys.exit(1) # Hack for old versions of MediaWiki API where result is dict @@ -312,13 +325,13 @@ def getPageTitlesAPI(config={}, session=None): c += len(allpages) if len(titles) != len(set(titles)): - print 'Probably a loop, switching to next namespace. Duplicate title:' - print title + print ('Probably a loop, switching to next namespace. Duplicate title:') + print (title) titles = list(set(titles)) apfrom = '' delay(config=config, session=session) - print ' %d titles retrieved in the namespace %d' % (c, namespace) + print (' %d titles retrieved in the namespace %d' % (c, namespace)) def getPageTitlesScraper(config={}, session=None): """ Scrape the list of page titles from Special:Allpages """ @@ -326,7 +339,7 @@ def getPageTitlesScraper(config={}, session=None): namespaces, namespacenames = getNamespacesScraper( config=config, session=session) for namespace in namespaces: - print ' Retrieving titles in the namespace', namespace + print (' Retrieving titles in the namespace', namespace) url = '%s?title=Special:Allpages&namespace=%s' % ( config['index'], namespace) r = session.get(url=url, timeout=30) @@ -385,9 +398,9 @@ def getPageTitlesScraper(config={}, session=None): raw2 = r2.text raw2 = cleanHTML(raw2) rawacum += raw2 # merge it after removed junk - print ' Reading', name, len(raw2), 'bytes', \ + print (' Reading', name, len(raw2), 'bytes', \ len(re.findall(r_suballpages, raw2)), 'subpages', \ - len(re.findall(r_title, raw2)), 'pages' + len(re.findall(r_title, raw2)), 'pages') delay(config=config, session=session) c += 1 @@ -400,7 +413,7 @@ def getPageTitlesScraper(config={}, session=None): if t not in titles: titles.append(t) c += 1 - print ' %d titles retrieved in the namespace %d' % (c, namespace) + print (' %d titles retrieved in the namespace %d' % (c, namespace)) return titles @@ -409,40 +422,40 @@ def getPageTitles(config={}, session=None): # http://en.wikipedia.org/wiki/Special:AllPages # http://archiveteam.org/index.php?title=Special:AllPages # http://www.wikanda.es/wiki/Especial:Todas - print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None') - print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None') + print ('Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')) + print ('Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')) titles = [] if 'api' in config and config['api']: try: titles = getPageTitlesAPI(config=config, session=session) except: - print "Error: could not get page titles from the API" + print ("Error: could not get page titles from the API") titles = getPageTitlesScraper(config=config, session=session) elif 'index' in config and config['index']: titles = getPageTitlesScraper(config=config, session=session) titlesfilename = '%s-%s-titles.txt' % ( domain2prefix(config=config), config['date']) - titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'wt') + titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'wb') c = 0 for title in titles: - titlesfile.write(title.encode('utf-8') + "\n") + titlesfile.write(title.encode('utf-8') + b"\n") c += 1 # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times: # main namespace and widget namespace. # We can use sort -u in UNIX, but is it worth it? - titlesfile.write(u'--END--\n') + titlesfile.write(b'--END--\n') titlesfile.close() - print 'Titles saved at...', titlesfilename + print ('Titles saved at...', titlesfilename) - print '%d page titles loaded' % (c) + print ('%d page titles loaded' % (c)) return titlesfilename def getImageNames(config={}, session=None): """ Get list of image names """ - print 'Retrieving image filenames' + print ('Retrieving image filenames') images = [] if 'api' in config and config['api']: images = getImageNamesAPI(config=config, session=session) @@ -452,7 +465,7 @@ def getImageNames(config={}, session=None): # images = list(set(images)) # it is a list of lists images.sort() - print '%d image names loaded' % (len(images)) + print ('%d image names loaded' % (len(images))) return images @@ -462,11 +475,11 @@ def getXMLHeader(config={}, session=None): # similar to: 0 and c < maxretries: wait = increment * c < maxseconds and increment * \ c or maxseconds # incremental until maxseconds - print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait) + print (' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)) time.sleep(wait) # reducing server load requesting smallest chunks (if curonly then # limit = 1 from mother function) if params['limit'] > 1: params['limit'] = params['limit'] / 2 # half if c >= maxretries: - print ' We have retried %d times' % (c) - print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) + print (' We have retried %d times' % (c)) + print (' MediaWiki error for "%s", network error or whatever...' % (params['pages'])) if config['failfast']: - print "Exit, it will be for another time" + print ("Exit, it will be for another time") sys.exit() # If it's not already what we tried: our last chance, preserve only the last revision... # config['curonly'] means that the whole dump is configured to save only the last, @@ -577,7 +590,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): # fallback, because it's set by the following if and passed to # getXMLPageCore if not config['curonly'] and not 'curonly' in params: - print ' Trying to save only the last revision for this page...' + print (' Trying to save only the last revision for this page...') params['curonly'] = 1 logerror( config=config, @@ -591,7 +604,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): session=session ) else: - print ' Saving in the errors log, and skipping...' + print (' Saving in the errors log, and skipping...') logerror( config=config, text=u'Error while retrieving the last revision of "%s". Skipping.' % @@ -604,7 +617,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): handleStatusCode(r) xml = fixBOM(r) except requests.exceptions.ConnectionError as e: - print ' Connection error: %s'%(str(e[0])) + print (' Connection error: %s'%(str(e[0]))) xml = '' c += 1 @@ -667,7 +680,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): xml2 = getXMLPageCore( params=params, config=config, session=session) except MemoryError: - print "The page's history exceeds our memory, halving limit." + print ("The page's history exceeds our memory, halving limit.") params['limit'] = params['limit'] / 2 continue @@ -677,7 +690,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): # again the same XML, this wiki does not support params in # Special:Export, offer complete XML up to X edits (usually # 1000) - print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated' + print ('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated') truncated = True break else: @@ -698,7 +711,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): xml2 = xml2.split("")[0] yield ' ' + (''.join(xml2.split('')[1:])) except MemoryError: - print "The page's history exceeds our memory, halving limit." + print ("The page's history exceeds our memory, halving limit.") params['limit'] = params['limit'] / 2 continue xml = xml2 @@ -709,9 +722,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None): if verbose: if (numberofedits == 1): - print ' %s, 1 edit' % (title.strip()) + print (' %s, 1 edit' % (title.strip())) else: - print ' %s, %d edits' % (title.strip(), numberofedits) + print (' %s, %d edits' % (title.strip(), numberofedits)) def cleanXML(xml=''): @@ -729,7 +742,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): # TODO: titles is now unused. header, config = getXMLHeader(config=config, session=session) - footer = '\n' # new line at the end + footer = b'\n' # new line at the end xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') @@ -737,7 +750,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): lock = True if config['xmlrevisions']: - print 'Retrieving the XML for every page from the beginning' + print ('Retrieving the XML for every page from the beginning') xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') xmlfile.write(header.encode('utf-8')) try: @@ -745,26 +758,26 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): for xml in getXMLRevisions(config=config, session=session): numrevs = len(re.findall(r_timestamp, xml)) # Due to how generators work, it's expected this may be less - print "%d more revisions exported" % numrevs + print ("%d more revisions exported" % numrevs) xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) except AttributeError: - print "This wikitools module version is not working" + print ("This wikitools module version is not working") sys.exit() else: - print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') + print ('Retrieving the XML for every page from "%s"' % (start and start or 'start')) if start: - print "Removing the last chunk of past XML dump: it is probably incomplete." + print ("Removing the last chunk of past XML dump: it is probably incomplete.") for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True): pass else: # requested complete xml dump lock = False - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') + xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'wb') xmlfile.write(header.encode('utf-8')) xmlfile.close() - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') + xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'ab') c = 1 for title in readTitles(config, start): if not title.strip(): @@ -775,7 +788,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): continue delay(config=config, session=session) if c % 10 == 0: - print 'Downloaded %d pages' % (c) + print ('Downloaded %d pages' % (c)) try: for xml in getXMLPage(config=config, title=title, session=session): xml = cleanXML(xml=xml) @@ -794,7 +807,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): xmlfile.write(footer) xmlfile.close() - print 'XML dump saved at...', xmlfilename + print ('XML dump saved at...', xmlfilename) def getXMLRevisions(config={}, session=None, allpages=False): site = wikitools.wiki.Wiki(config['api']) @@ -805,7 +818,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): try: for namespace in namespaces: - print "Trying to export all revisions from namespace %s" % namespace + print ("Trying to export all revisions from namespace %s" % namespace) arvparams = { 'action': 'query', 'list': 'allrevisions', @@ -832,7 +845,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): for page in result['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) - print "%d more revisions listed, until %s" % (len(revids), revids[-1]) + print ("%d more revisions listed, until %s" % (len(revids), revids[-1])) exportparams = { 'action': 'query', @@ -845,7 +858,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): yield exportresult['query']['export']['*'] except KeyError: - print "Warning. Could not use allrevisions, wiki too old." + print ("Warning. Could not use allrevisions, wiki too old.") if config['curonly']: for title in readTitles(config): exportparams = { @@ -885,7 +898,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): yield xml except wikitools.api.APIError: - print "This wikitools version seems not to work for us. Exiting." + print ("This wikitools version seems not to work for us. Exiting.") sys.exit() def makeXmlFromPage(page): @@ -989,11 +1002,11 @@ def saveImageNames(config={}, images=[], session=None): imagesfilename = '%s-%s-images.txt' % ( domain2prefix(config=config), config['date']) - imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') + imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'wb') imagesfile.write( - ('\n'.join( + (u'\n'.join( [ - '%s\t%s\t%s' % + u'%s\t%s\t%s' % (filename, url, uploader) for filename, @@ -1002,10 +1015,10 @@ def saveImageNames(config={}, images=[], session=None): ).encode('utf-8') ) ) - imagesfile.write('\n--END--') + imagesfile.write(b'\n--END--') imagesfile.close() - print 'Image filenames and URLs saved at...', imagesfilename + print ('Image filenames and URLs saved at...', imagesfilename) def curateImageURL(config={}, url=''): @@ -1019,7 +1032,7 @@ def curateImageURL(config={}, url=''): domainalone = config['api'].split( '://')[0] + '://' + config['api'].split('://')[1].split('/')[0] else: - print 'ERROR: no index nor API' + print ('ERROR: no index nor API') sys.exit() if url.startswith('//'): # Orain wikifarm returns URLs starting with // @@ -1031,7 +1044,7 @@ def curateImageURL(config={}, url=''): # concat http(s) + domain + relative url url = u'%s/%s' % (domainalone, url) url = undoHTMLEntities(text=url) - # url = urllib.unquote(url) #do not use unquote with url, it break some + # url = unquote(url) #do not use unquote with url, it break some # urls with odd chars url = re.sub(' ', '_', url) @@ -1062,18 +1075,18 @@ def getImageNamesScraper(config={}, session=None): delay(config=config, session=session) # delicate wiki if re.search( - ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', + u'(?i)(allowed memory size of \\d+ bytes exhausted|Call to a member function getURL)', raw): if limit > 10: - print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) + print ('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)) limit = limit / 10 continue elif retries > 0: # waste retries, then exit retries -= 1 - print 'Retrying...' + print ('Retrying...') continue else: - print 'No more retries, exit...' + print ('No more retries, exit...') break raw = cleanHTML(raw) @@ -1116,12 +1129,12 @@ def getImageNamesScraper(config={}, session=None): url = curateImageURL(config=config, url=url) filename = re.sub('_', ' ', i.group('filename')) filename = undoHTMLEntities(text=filename) - filename = urllib.unquote(filename) + filename = unquote(filename) uploader = re.sub('_', ' ', i.group('uploader')) uploader = undoHTMLEntities(text=uploader) - uploader = urllib.unquote(uploader) + uploader = unquote(uploader) images.append([filename, url, uploader]) - # print filename, url + # print (filename, url) if re.search(r_next, raw): new_offset = re.findall(r_next, raw)[0] @@ -1135,9 +1148,9 @@ def getImageNamesScraper(config={}, session=None): offset = '' if (len(images) == 1): - print ' Found 1 image' + print (' Found 1 image') else: - print ' Found %d images' % (len(images)) + print (' Found %d images' % (len(images))) images.sort() return images @@ -1178,18 +1191,24 @@ def getImageNamesAPI(config={}, session=None): aifrom = jsonimages['continue']['aicontinue'] elif 'aifrom' in jsonimages['continue']: aifrom = jsonimages['continue']['aifrom'] - # print aifrom + # print (aifrom) for image in jsonimages['query']['allimages']: url = image['url'] url = curateImageURL(config=config, url=url) - # encoding to ascii is needed to work around this horrible bug: - # http://bugs.python.org/issue8136 if 'api' in config and '.wikia.com' in config['api']: #to avoid latest?cb=20120816112532 in filenames - filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8') + filename = re.sub(u'_', u' ', url.split('/')[-3]) + else: + filename = re.sub(u'_', u' ', url.split('/')[-1]) + + if sys.version_info < (3, 0): + # encoding to ascii is needed to work around this horrible bug: + # http://bugs.python.org/issue8136 + filename = unquote(filename.encode('ascii', 'ignore')).decode('utf-8') else: - filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8') + filename = unquote(filename) + uploader = re.sub('_', ' ', image['user']) images.append([filename, url, uploader]) else: @@ -1228,8 +1247,8 @@ def getImageNamesAPI(config={}, session=None): if 'gapfrom' in jsonimages['query-continue']['allpages']: gapfrom = jsonimages[ 'query-continue']['allpages']['gapfrom'] - # print gapfrom - # print jsonimages['query'] + # print (gapfrom) + # print (jsonimages['query']) for image, props in jsonimages['query']['pages'].items(): url = props['imageinfo'][0]['url'] @@ -1245,9 +1264,9 @@ def getImageNamesAPI(config={}, session=None): break if (len(images) == 1): - print ' Found 1 image' + print (' Found 1 image') else: - print ' Found %d images' % (len(images)) + print (' Found %d images' % (len(images))) return images @@ -1270,10 +1289,10 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): """ Save files and descriptions using a file list """ # fix use subdirectories md5 - print 'Retrieving images from "%s"' % (start and start or 'start') + print ('Retrieving images from "%s"' % (start and start or 'start')) imagepath = '%s/images' % (config['path']) if not os.path.isdir(imagepath): - print 'Creating "%s" directory' % (imagepath) + print ('Creating "%s" directory' % (imagepath)) os.makedirs(imagepath) c = 0 @@ -1290,11 +1309,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): # saving file # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash # limit). Later .desc is added to filename, so better 100 as max) - filename2 = urllib.unquote(filename) + filename2 = unquote(filename) if len(filename2) > other['filenamelimit']: # split last . (extension) and then merge filename2 = truncateFilename(other=other, filename=filename2) - print 'Filename is too long, truncating. Now it is:', filename2 + print ('Filename is too long, truncating. Now it is:', filename2) filename3 = u'%s/%s' % (imagepath, filename2) imagefile = open(filename3, 'wb') r = requests.get(url=url) @@ -1318,9 +1337,9 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8')) ) - f = open('%s/%s.desc' % (imagepath, filename2), 'w') + f = open('%s/%s.desc' % (imagepath, filename2), 'wb') # Banner featuring SG1, SGA, SGU teams - if not re.search(r'', xmlfiledesc): + if not re.search('', xmlfiledesc): # failure when retrieving desc? then save it as empty .desc xmlfiledesc = '' f.write(xmlfiledesc.encode('utf-8')) @@ -1328,9 +1347,9 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): delay(config=config, session=session) c += 1 if c % 10 == 0: - print ' Downloaded %d images' % (c) + print (' Downloaded %d images' % (c)) - print 'Downloaded %d images' % (c) + print ('Downloaded %d images' % (c)) def saveLogs(config={}, session=None): @@ -1378,9 +1397,9 @@ def loadConfig(config={}, configfilename=''): try: with open('%s/%s' % (config['path'], configfilename), 'r') as infile: - config = cPickle.load(infile) + config = pickle.load(infile) except: - print 'There is no config file. we can\'t resume. Start a new dump.' + print ('There is no config file. we can\'t resume. Start a new dump.') sys.exit() return config @@ -1389,8 +1408,8 @@ def loadConfig(config={}, configfilename=''): def saveConfig(config={}, configfilename=''): """ Save config file """ - with open('%s/%s' % (config['path'], configfilename), 'w') as outfile: - cPickle.dump(config, outfile) + with open('%s/%s' % (config['path'], configfilename), 'wb') as outfile: + pickle.dump(config, outfile) def welcome(): @@ -1431,10 +1450,10 @@ def welcome(): def bye(): """ Closing message """ - print "---> Congratulations! Your dump is complete <---" - print "If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues" - print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam" - print "Good luck! Bye!" + print ("---> Congratulations! Your dump is complete <---") + print ("If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues") + print ("If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam") + print ("Good luck! Bye!") def getParameters(params=[]): @@ -1523,33 +1542,33 @@ def getParameters(params=[]): help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.") args = parser.parse_args() - # print args + # print (args) # Don't mix download params and meta info params if (args.xml or args.images) and \ (args.get_wiki_engine): - print 'ERROR: Don\'t mix download params and meta info params' + print ('ERROR: Don\'t mix download params and meta info params') parser.print_help() sys.exit(1) # No download params and no meta info params? Exit if (not args.xml and not args.images) and \ (not args.get_wiki_engine): - print 'ERROR: Use at least one download param or meta info param' + print ('ERROR: Use at least one download param or meta info param') parser.print_help() sys.exit(1) # Execute meta info params if args.wiki: if args.get_wiki_engine: - print getWikiEngine(url=args.wiki) + print (getWikiEngine(url=args.wiki)) sys.exit() # Create session - cj = cookielib.MozillaCookieJar() + cj = CookieJar.MozillaCookieJar() if args.cookies: cj.load(args.cookies) - print 'Using cookies from %s' % args.cookies + print ('Using cookies from %s' % args.cookies) session = requests.Session() try: @@ -1572,8 +1591,8 @@ def getParameters(params=[]): # check URLs for url in [args.api, args.index, args.wiki]: if url and (not url.startswith('http://') and not url.startswith('https://')): - print url - print 'ERROR: URLs must start with http:// or https://\n' + print (url) + print ('ERROR: URLs must start with http:// or https://\n') parser.print_help() sys.exit(1) @@ -1589,7 +1608,7 @@ def getParameters(params=[]): if not index: index = index2 else: - print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki' + print ('ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki') sys.exit(1) else: if api == '': @@ -1597,8 +1616,8 @@ def getParameters(params=[]): elif index == '': index = '/'.join(api.split('/')[:-1]) + '/index.php' - # print api - # print index + # print (api) + # print (index) index2 = None if api: @@ -1611,26 +1630,26 @@ def getParameters(params=[]): check = checkAPI(api=api, session=session) break except requests.exceptions.ConnectionError as e: - print 'Connection error: %s'%(str(e)) + print ('Connection error: %s'%(str(e))) retry += 1 - print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay) + print ("Start retry attempt %d in %d seconds."%(retry+1, retrydelay)) time.sleep(retrydelay) if api and check: index2 = check[1] api = check[2] - print 'API is OK: ' + api + print ('API is OK: ' + api) else: if index and not args.wiki: - print 'API not available. Trying with index.php only.' + print ('API not available. Trying with index.php only.') else: - print 'Error in API. Please, provide a correct path to API' + print ('Error in API. Please, provide a correct path to API') sys.exit(1) if index and checkIndex( index=index, cookies=args.cookies, session=session): - print 'index.php is OK' + print ('index.php is OK') else: index = index2 if index and index.startswith('//'): @@ -1639,7 +1658,7 @@ def getParameters(params=[]): index=index, cookies=args.cookies, session=session): - print 'index.php is OK' + print ('index.php is OK') else: try: index = '/'.join(index.split('/')[:-1]) @@ -1649,16 +1668,16 @@ def getParameters(params=[]): index=index, cookies=args.cookies, session=session): - print 'index.php is OK' + print ('index.php is OK') else: - print 'Error in index.php.' + print ('Error in index.php.') if not args.xmlrevisions: - print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.' + print ('Please, provide a correct path to index.php or use --xmlrevisions. Terminating.') sys.exit(1) # check user and pass (one requires both) if (args.user and not args.password) or (args.password and not args.user): - print 'ERROR: Both --user and --pass are required for authentication.' + print ('ERROR: Both --user and --pass are required for authentication.') parser.print_help() sys.exit(1) @@ -1670,7 +1689,7 @@ def getParameters(params=[]): if re.search( r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all': - print "Invalid namespace values.\nValid format is integer(s) separated by commas" + print ("Invalid namespace values.\nValid format is integer(s) separated by commas") sys.exit() else: ns = re.sub(' ', '', args.namespaces) @@ -1682,19 +1701,19 @@ def getParameters(params=[]): # Process namespace exclusions if args.exnamespaces: if re.search(r'[^\d, \-]', args.exnamespaces): - print "Invalid namespace values.\nValid format is integer(s) separated by commas" + print ("Invalid namespace values.\nValid format is integer(s) separated by commas") sys.exit(1) else: ns = re.sub(' ', '', args.exnamespaces) if ns.lower() == 'all': - print 'You cannot exclude all namespaces.' + print ('You cannot exclude all namespaces.') sys.exit(1) else: exnamespaces = [int(i) for i in ns.split(',')] # --curonly requires --xml if args.curonly and not args.xml: - print "--curonly requires --xml\n" + print ("--curonly requires --xml\n") parser.print_help() sys.exit(1) @@ -1735,7 +1754,7 @@ def checkAPI(api=None, session=None): global cj # handle redirects for i in range(4): - print 'Checking API...', api + print ('Checking API...', api) r = session.post( url=api, data={ @@ -1750,7 +1769,7 @@ def checkAPI(api=None, session=None): p = r.url api = urlunparse([p.scheme, p.netloc, p.path, '', '', '']) elif r.status_code > 400: - print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code + print ("MediaWiki API URL not found or giving error: HTTP %d" % r.status_code) return False if "MediaWiki API is not enabled for this site." in r.text: return False @@ -1763,11 +1782,11 @@ def checkAPI(api=None, session=None): result['query']['general']['script'] return ( True, index, api ) except KeyError: - print "MediaWiki API seems to work but returned no index URL" + print ("MediaWiki API seems to work but returned no index URL") return (True, None, api) except ValueError: - print repr(r.text) - print "MediaWiki API returned data we could not parse" + print (repr(r.text)) + print ("MediaWiki API returned data we could not parse") return False return False @@ -1776,17 +1795,17 @@ def checkIndex(index=None, cookies=None, session=None): """ Checking index.php availability """ r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30) raw = r.text - print 'Checking index.php...', index + print ('Checking index.php...', index) # Workaround for issue 71 if re.search( r'(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', raw) and not cookies: - print "ERROR: This wiki requires login and we are not authenticated" + print ("ERROR: This wiki requires login and we are not authenticated") return False if re.search( r'(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)', raw): - print "Looks like the page called Index.php, not index.php itself" + print ("Looks like the page called Index.php, not index.php itself") return False if re.search( r'(This wiki is powered by|

|meta name="generator" content="MediaWiki)', @@ -1831,7 +1850,7 @@ def checkXMLIntegrity(config={}, titles=[], session=None): """ Check XML dump integrity, to detect broken XML chunks """ return - print 'Verifying dump...' + print ('Verifying dump...') checktitles = 0 checkpageopen = 0 checkpageclose = 0 @@ -1861,21 +1880,21 @@ def checkXMLIntegrity(config={}, titles=[], session=None): if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose): pass else: - print 'XML dump seems to be corrupted.' + print ('XML dump seems to be corrupted.') reply = '' if config['failfast']: reply = 'yes' while reply.lower() not in ['yes', 'y', 'no', 'n']: - reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ') + reply = input('Regenerate a new dump ([yes, y], [no, n])? ') if reply.lower() in ['yes', 'y']: generateXMLDump(config=config, titles=titles, session=session) elif reply.lower() in ['no', 'n']: - print 'Not generating a new dump.' + print ('Not generating a new dump.') def createNewDump(config={}, other={}): images = [] - print 'Trying generating a new dump into a new directory...' + print ('Trying generating a new dump into a new directory...') if config['xml']: getPageTitles(config=config, session=other['session']) titles=readTitles(config) @@ -1898,7 +1917,7 @@ def createNewDump(config={}, other={}): def resumePreviousDump(config={}, other={}): images = [] - print 'Resuming previous dump process...' + print ('Resuming previous dump process...') if config['xml']: titles=readTitles(config) try: @@ -1914,9 +1933,9 @@ def resumePreviousDump(config={}, other={}): lasttitle = '' # probably file does not exists if lasttitle == '--END--': # titles list is complete - print 'Title list was completed in the previous session' + print ('Title list was completed in the previous session') else: - print 'Title list is incomplete. Reloading...' + print ('Title list is incomplete. Reloading...') # do not resume, reload, to avoid inconsistences, deleted pages or # so getPageTitles(config=config, session=other['session']) @@ -1948,10 +1967,10 @@ def resumePreviousDump(config={}, other={}): pass # probably file does not exists if xmliscomplete: - print 'XML dump was completed in the previous session' + print ('XML dump was completed in the previous session') elif lastxmltitle: # resuming... - print 'Resuming XML dump from "%s"' % (lastxmltitle) + print ('Resuming XML dump from "%s"' % (lastxmltitle)) titles = readTitles(config, start=lastxmltitle) generateXMLDump( config=config, @@ -1960,7 +1979,7 @@ def resumePreviousDump(config={}, other={}): session=other['session']) else: # corrupt? only has XML header? - print 'XML is corrupt? Regenerating...' + print ('XML is corrupt? Regenerating...') titles = readTitles(config) generateXMLDump( config=config, titles=titles, session=other['session']) @@ -1986,9 +2005,9 @@ def resumePreviousDump(config={}, other={}): except: pass # probably file doesnot exists if lastimage == u'--END--': - print 'Image list was completed in the previous session' + print ('Image list was completed in the previous session') else: - print 'Image list is incomplete. Reloading...' + print ('Image list is incomplete. Reloading...') # do not resume, reload, to avoid inconsistences, deleted images or # so images = getImageNames(config=config, session=other['session']) @@ -2015,10 +2034,10 @@ def resumePreviousDump(config={}, other={}): complete = False break c += 1 - print '%d images were found in the directory from a previous session' % (c) + print ('%d images were found in the directory from a previous session' % (c)) if complete: # image dump is complete - print 'Image dump was completed in the previous session' + print ('Image dump was completed in the previous session') else: # we resume from previous image, which may be corrupted (or missing # .desc) by the previous session ctrl-c or abort @@ -2038,15 +2057,15 @@ def saveSpecialVersion(config={}, session=None): """ Save Special:Version as .html, to preserve extensions details """ if os.path.exists('%s/Special:Version.html' % (config['path'])): - print 'Special:Version.html exists, do not overwrite' + print ('Special:Version.html exists, do not overwrite') else: - print 'Downloading Special:Version with extensions and other related info' + print ('Downloading Special:Version with extensions and other related info') r = session.post( url=config['index'], params={'title': 'Special:Version'}, timeout=10) raw = r.text delay(config=config, session=session) raw = removeIP(raw=raw) - with open('%s/Special:Version.html' % (config['path']), 'w') as outfile: + with open('%s/Special:Version.html' % (config['path']), 'wb') as outfile: outfile.write(raw.encode('utf-8')) @@ -2054,14 +2073,14 @@ def saveIndexPHP(config={}, session=None): """ Save index.php as .html, to preserve license details available at the botom of the page """ if os.path.exists('%s/index.html' % (config['path'])): - print 'index.html exists, do not overwrite' + print ('index.html exists, do not overwrite') else: - print 'Downloading index.php (Main Page) as index.html' + print ('Downloading index.php (Main Page) as index.html') r = session.post(url=config['index'], params={}, timeout=10) raw = r.text delay(config=config, session=session) raw = removeIP(raw=raw) - with open('%s/index.html' % (config['path']), 'w') as outfile: + with open('%s/index.html' % (config['path']), 'wb') as outfile: outfile.write(raw.encode('utf-8')) def saveSiteInfo(config={}, session=None): @@ -2069,9 +2088,9 @@ def saveSiteInfo(config={}, session=None): if config['api']: if os.path.exists('%s/siteinfo.json' % (config['path'])): - print 'siteinfo.json exists, do not overwrite' + print ('siteinfo.json exists, do not overwrite') else: - print 'Downloading site info as siteinfo.json' + print ('Downloading site info as siteinfo.json') # MediaWiki 1.13+ r = session.post( @@ -2105,8 +2124,8 @@ def saveSiteInfo(config={}, session=None): timeout=10) result = getJSON(r) delay(config=config, session=session) - with open('%s/siteinfo.json' % (config['path']), 'w') as outfile: - outfile.write(json.dumps(result, indent=4, sort_keys=True)) + with open('%s/siteinfo.json' % (config['path']), 'wb') as outfile: + outfile.write(json.dumps(result, indent=4, sort_keys=True).encode('utf-8')) def avoidWikimediaProjects(config={}, other={}): @@ -2119,12 +2138,12 @@ def avoidWikimediaProjects(config={}, other={}): if config['index']: url = url + config['index'] if re.findall( - r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', + '(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\\.org', url): - print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' - print 'Download the dumps from http://dumps.wikimedia.org' + print ('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!') + print ('Download the dumps from http://dumps.wikimedia.org') if not other['force']: - print 'Thanks!' + print ('Thanks!') sys.exit() @@ -2140,73 +2159,73 @@ def getWikiEngine(url=''): wikiengine = 'Unknown' if re.search( - ur'(?im)()', result): + elif re.search(u'(?im)(>MoinMoin Powered|