Downloads: wikidownloader.py

File wikidownloader.py, 7.7 KB (added by admin, 7 years ago)
#!/usr/bin/python

"""
MIT License

Copyright (c) 2016 Vit Baisa

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import os
import sys
import argparse
import urllib2
import urllib
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

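# URL templates: LATEST points to the dump of all main-namespace titles,
# API_HTML returns the parsed HTML of a page, API_JSON its revision metadata.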
LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

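# Timestamps of the most recent API calls (used by api_wait for throttling)
# and a global log file handle, opened in main().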
last_api_request = datetime.datetime.now()
last_api_parse = datetime.datetime.now()
logf = None

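# Convert the HTML returned by the parse API into a list of paragraph
# dictionaries using jusText's preprocessing and paragraph segmentation.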
def html2prevert(s):
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        return justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return []

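# Sleep long enough to keep at least wait_interval seconds between two
# consecutive API requests; last is the timestamp of the previous request.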
def api_wait(last):
    global wait_interval
    global logf
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

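# Download and parse a single article through the MediaWiki API. Returns a
# tuple (document, number_of_paragraphs, revision_id) where document is a
# <doc> block containing one <p> element per paragraph extracted by jusText.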
def process_article(langcode, title, linksf):
    global last_api_parse
    global logf
    api_wait(last_api_parse)
    last_api_parse = datetime.datetime.now()
    # quote the title so special characters survive in the URL
    resp = urllib2.urlopen(API_HTML % (langcode, urllib.quote(title)))
    data = json.load(resp)
    if 'error' in data:
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if html.strip():
        pars = html2prevert(html.encode('utf-8')) # justext decodes!
    else:
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    if not pars:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    outp = []
    for par in pars:
        parx = justext.html_escape(par['text'])
        outp.append(parx)
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    #links = '\v'.join([d['*'].replace('"', '') for d in p['links']])
    categories = '\v'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line + '\n')
        linksf.write('\n')
    s = ''
    chars = 0
    for par in outp:
        s += '<p>\n'
        s += par
        s += '\n</p>\n'
        chars += len(par)
    print >>logf, '\t%d chars' % chars
    header = '<doc title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d">\n' %\
            (title, categories.encode('utf-8'), langlinks_len,
            #links.encode('utf-8'),
            len(outp), chars)
    return header + s.encode('utf-8') + '</doc>\n', len(outp), revid

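# Main driver: load the title->revid cache, fetch the list of all titles from
# the latest dump, download each uncached article (or, with --newest, each
# cached article whose revision changed) and write the prevert to stdout.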
def main(langcode, cachefn, logfn, newest, links):
    global logf
    global last_api_request
    logf = open(logfn, 'w')
    print >>sys.stderr, "Log will be stored in %s" % logfn
    linksf = open(links, 'w') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'w') # empty cache file
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            print >>logf, '%s' % title
            article = '' # make sure article is defined even when nothing is downloaded
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
                    data = json.load(resp)
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            current_revid = dqp[key]['revisions'][0]['revid']
                            if previous_revid != str(current_revid): # cache stores revids as strings
                                article, parlen, revid = process_article(langcode, title, linksf)
                                cache[title] = revid
                                cf.write('%s\t%d\n' % (title, revid))
                            else:
                                print >>logf, '\tskipping cached'
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        article = ''
                        empty_articles += 1
                else:
                    # do not download
                    print >>logf, '\tskip already downloaded'
                    skipped_articles += 1
                    article = ''
            else:
                try:
                    article, parlen, revid = process_article(langcode, title, linksf)
                    cache[title] = revid
                    cf.write('%s\t%d\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    article = ''
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()

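# Example usage (the prevert goes to stdout, progress to a timestamped log
# file, the title->revid cache to <langcode>wiki.cache unless --cache is set):
#   python wikidownloader.py cs --wait 2.0 --links cs_links.txt > cswiki.prevert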
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', type=str, help='Wikipedia language prefix')
    parser.add_argument('--cache', help='Cache file with titles and revision IDs of previously downloaded pages', type=str, default='')
    parser.add_argument('--wait', help='Time interval in seconds between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='File to store external links gathered from Wikipedia', type=str, default='')
    args = parser.parse_args()
    wait_interval = args.wait

    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = current_time + '.log'
    cachefile = args.cache or args.langcode + 'wiki.cache'
    main(args.langcode, cachefile, logfile, args.newest, args.links)