Downloads: wikidownloader.py

File wikidownloader.py, 7.7 KB (added by admin, 8 months ago)
#!/usr/bin/python

"""
MIT License

Copyright (c) 2016 Vit Baisa

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import os
import sys
import argparse
import urllib2
import urllib
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError

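# Exceptions signalling articles that cannot be turned into a document:
# a missing page, empty HTML returned by the API, or empty jusText output.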
class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

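# URL templates: the latest dump of all article titles (namespace 0) and the
# MediaWiki API endpoints for parsed HTML and for revision metadata.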
LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

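# Timestamps of the most recent API calls; api_wait() uses them to keep at
# least wait_interval seconds between consecutive requests.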
last_api_request = datetime.datetime.now()
last_api_parse   = datetime.datetime.now()
logf = None

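# Convert an HTML page into a list of jusText paragraphs; on lxml parse errors
# an empty list is returned and the article is treated as empty.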
def html2prevert(s):
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        return justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return []

def api_wait(last):
    global wait_interval
    n = datetime.datetime.now()
    interval = (n - last).seconds + ((n - last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

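# Download a single article through the parse API, clean it with jusText and
# return the document as a <doc>...</doc> string together with the number of
# paragraphs and the current revision ID.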
def process_article(langcode, title, linksf):
    global last_api_parse
    global logf
    api_wait(last_api_parse)
    last_api_parse = datetime.datetime.now()
    resp = urllib2.urlopen(API_HTML % (langcode, urllib.quote(title))) # percent-encode the title for the URL
    data = json.load(resp)
    if 'error' in data:
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if html.strip():
        pars = html2prevert(html.encode('utf-8')) # justext decodes!
    else:
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    if not pars:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    outp = []
    for par in pars:
        parx = justext.html_escape(par['text'])
        outp.append(parx)
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    #links = '\v'.join([d['*'].replace('"', '') for d in p['links']])
    categories = '\v'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line + '\n')
        linksf.write('\n')
    s = ''
    chars = 0
    for paragraph in outp:
        s += '<p>\n'
        s += paragraph
        s += '\n</p>\n'
        chars += len(paragraph)
    print >>logf, '\t%d chars' % chars
    header = '<doc title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d">\n' %\
            (title, categories.encode('utf-8'), langlinks_len,
                    #links.encode('utf-8'),
                    len(outp), chars)
    return header + s.encode('utf-8') + '</doc>\n', len(outp), revid

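# Fetch the list of all article titles from the latest dump, then download
# every title not yet in the cache (or whose revision changed, with --newest),
# writing documents to stdout and title/revid pairs to the cache file.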
def main(langcode, cachefn, logfn, newest, links):
    global last_api_request
    global logf
    logf = open(logfn, 'w')
    print >>sys.stderr, "Log will be stored in %s" % logfn
    linksf = open(links, 'w') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'w') # empty cache file
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            article = '' # reset so a skipped title never re-emits the previous article
            print >>logf, '%s' % title
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
                    data = json.load(resp)
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            current_revid = dqp[key]['revisions'][0]['revid']
                            if previous_revid != str(current_revid):
                                article, parlen, revid = process_article(langcode, title, linksf)
                                cache[title] = revid
                                cf.write('%s\t%s\n' % (title, revid))
                            else:
                                print >>logf, '\tskipping cached'
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        empty_articles += 1
                else:
                    # do not download
                    print >>logf, '\tskip already downloaded'
                    skipped_articles += 1
            else:
                try:
                    article, parlen, revid = process_article(langcode, title, linksf)
                    cache[title] = revid
                    cf.write('%s\t%s\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', type=str, help='Wikipedia language prefix')
    parser.add_argument('--cache', help='File with previously downloaded titles and revision IDs', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests (seconds)', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    args = parser.parse_args()
    wait_interval = args.wait

    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = current_time + '.log'
    cachefile = args.cache or args.langcode + 'wiki.cache'
    main(args.langcode, cachefile, logfile, args.newest, args.links)
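Usage sketch (assumes Python 2 with the justext and lxml packages installed; the file names below are only examples). The cleaned corpus is written to standard output, and a timestamped .log file is created in the current directory:

    python wikidownloader.py en --wait 1.0 --newest --links en_links.txt > enwiki.prevert

Without --cache, the title/revision cache is kept in <langcode>wiki.cache in the current directory and is reused on the next run to skip already downloaded articles.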