#!/usr/bin/python3
#coding=utf-8

"""
MIT License

Copyright (c) 2020 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.3'

import re
import os
import sys
import argparse
import socket # socket.timeout is caught around urlopen calls below
# use requests
import urllib.request
from urllib.error import HTTPError, URLError # raised by urllib.request.urlopen
from urllib.parse import quote as url_quote
import http.client # httplib.HTTPException
import datetime
import json
import time
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category

remove_links_re = re.compile('</?a[^>]*>')

class PageNotFound(Exception):
    pass

class RequestTimeout(Exception):
    pass

class InvalidResponse(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx
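
# A trimmed sketch of the action=parse JSON response that process_page() below
# relies on; values are illustrative only (see the API help URL above):
#   {"parse": {"title": "...", "revid": 123456789,
#              "text": {"*": "<div>...rendered page HTML...</div>"},
#              "langlinks": [{"lang": "de", "*": "..."}, ...],
#              "categories": [{"sortkey": "", "*": "..."}, ...],
#              "externallinks": ["https://example.org/", ...]}}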

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}

def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #ratio of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #ratio of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #short/neargood headings within this distance [chars] before a good paragraph => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = ' heading="yes"' if p['heading'] else ''
                prevert_paragraphs.append('<p%s>\n%s\n</p>' % (heading, p_text))
    return ('\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
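
# A minimal usage sketch of html2prevert(), assuming a hypothetical wordlist
# file; the HTML is passed as UTF-8 bytes because justext.preprocess decodes it:
#   with open('frequent_words.txt') as fp:
#       wordlist = set(line.rstrip() for line in fp)
#   prevert, n_paragraphs, n_chars = html2prevert(
#       '<html><body><p>Some article text ...</p></body></html>'.encode('utf-8'),
#       wordlist, justext_level='balanced')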

def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, url_quote(title))

    try:
        response_data = urllib.request.urlopen(api_url, timeout=10).read()
    except HTTPError as e:
        raise PageNotFound()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise RequestTimeout()
        else:
            raise PageNotFound()
    except socket.timeout as e:
        raise RequestTimeout()
    parse_time = datetime.datetime.now()
    try:
        response_data = response_data.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        logf.write('\tignoring a UnicodeDecodeError\n')
        response_data = response_data.decode('utf-8', errors='ignore')
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise InvalidResponse()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        linksf.write('%s\n' % '\n'.join(p['externallinks']))
    logf.write('\t%d chars\n' % plaintext_len)
    page_attrs = 'url="%s" title="%s" wiki_categories="%s" wiki_translations="%d" ' \
            'paragraphs="%d" chars="%d" crawl_date="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories,
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert)
    return (page, paragraph_count, revid, parse_time)
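
# The prevert written to stdout consists of <doc> elements in the shape built
# above; an illustrative (made-up) example, with the attributes wrapped here
# for readability although the real header is a single line:
#   <doc url="https://en.wikipedia.org/wiki/Example" title="Example"
#        wiki_categories="Some category|Another category" wiki_translations="12"
#        paragraphs="2" chars="1234" crawl_date="2020-01-01 12:00">
#   <p heading="yes">
#   Example
#   </p>
#   <p>
#   First good paragraph ...
#   </p>
#   </doc>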

def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    #check the cache first
    if title in cache:
        #download the newest revision if there is an old version in the cache
        if newest:
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            api_url = API_JSON % (langcode, url_quote(title))
            resp = urllib.request.urlopen(api_url)
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    logf.write('\tusing old revision %s instead of the newest '
                        'revision (invalid Wiki API response data)\n' % previous_revid)
                    current_revid = previous_revid
            #skip if the cached revision is already the newest
            if current_revid == previous_revid:
                hits_by_type['skipped'] += 1
                logf.write('\tskipping cached\n')
                return (page, parlen, revid, last_api_parse)
            #continue to download the page otherwise
        #skip because the title is already in the cache
        else:
            hits_by_type['skipped'] += 1
            logf.write('\tskipping already downloaded\n')
            return (page, parlen, revid, last_api_parse)
    #download the page since it is not in the cache or there is a new version
    try:
        page, parlen, revid, last_api_parse = \
            process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        hits_by_type['processed'] += 1
        logf.write('\t%d paragraphs\n' % parlen)
    except Exception as e: #PageNotFound, RequestTimeout, InvalidResponse, EmptyHTML, EmptyJusText
        page = ''
        hits_by_type['empty'] += 1
        log_msg = {
            'PageNotFound': '\tskipped -- page not found',
            'RequestTimeout': '\tskipped -- request timeout',
            'InvalidResponse': '\tskipped -- invalid response',
            'EmptyHTML': '\tempty HTML parse returned by API',
            'EmptyJusText': '\tempty prevert returned by jusText'
        }.get(type(e).__name__, '\tskipped or empty -- %s' % type(e).__name__)
        logf.write('%s\n' % log_msg)
    #update the cache (previous records for the same title are replaced when reloading the cache)
    if newest:
        cache[title] = revid #zero if page not found/empty/exception
    else:
        cache.add(title)
    cf.write('%s\t%s\n' % (title, revid))
    return (page, parlen, revid, last_api_parse)

def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort,
        title_file_path=''):
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    if len(justext_wordlist) == 0:
        logf.write('Wordlist file is empty, switching off stopwords detection.\n')
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'] = 0
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'] = 0

    linksf = open(links, 'a') if links else None
    cache = {} if newest else set()
    if os.path.exists(cachefn):
        logf.write('Cache: %s\n' % cachefn)
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    if newest:
                        cache[title.strip()] = revid.strip()
                    else:
                        cache.add(title.strip())
                except ValueError:
                    continue
        logf.write('Cache: %d titles loaded\n' % len(cache))

    langcode2 = langcode.replace('-', '_')
    wikidump_titles_url = LATEST % (langcode2, langcode2)
    if not title_file_path:
        title_file_path = wikidump_titles_url.rsplit('/', 1)[-1].replace('.gz', '')
    if not os.path.exists(title_file_path):
        logf.write('Getting all titles from latest Wikipedia dump %s to %s\n' %
            (wikidump_titles_url, title_file_path))
        wiki_title_data = urllib.request.urlopen(wikidump_titles_url).read()
        if wikidump_titles_url.endswith('.gz'):
            from io import BytesIO
            from gzip import GzipFile
            bio = BytesIO(wiki_title_data)
            bio.seek(0)
            wiki_title_data = GzipFile(fileobj=bio).read()
        with open(title_file_path, 'wb') as title_file:
            title_file.write(wiki_title_data)

    cf = open(cachefn, 'at') # cache file
    raw_response_fp = open(raw_response_path, 'at') #raw API responses
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    for line in open(title_file_path, 'rt', encoding='utf-8', errors='ignore'):
        title = line.strip().replace('"', "'")
        # TODO: filter titles, use RE as parameter
        logf.write('%s\n' % title)
        if not title or nicetitles and not unicode_char_category(title[0])[0] in ('L', 'N'):
            logf.write('\tskipping (not a nice title)\n')
            continue
        page_titles = (title, 'Talk:' + title) if talkpages else (title,)
        for page_title in page_titles:
            page, parlen, revid, last_api_parse = \
                go_page(langcode, page_title, linksf,
                    raw_response_fp, newest, justext_wordlist, logf,
                    last_api_request, last_api_parse, wait_interval,
                    justext_level, allowshort, cache, cf, hits_by_type)
            if page:
                sys.stdout.write(page)
    logf.write('Updated cache database stored in %s\n' % cachefn)
    if linksf:
        linksf.close()
    cf.close()
    for hit_type, hit_count in hits_by_type.items():
        logf.write('%s: %d\n' % (hit_type.title(), hit_count))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical or numerical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    parser.add_argument('--title-file', help='Path to a custom list of titles to download, one per line.', type=str, default='')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.rstrip() for line in fp])
    logfn = args.langcode.replace('/', '') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort, args.title_file)
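
# Example invocation (hypothetical file names; the prevert corpus is written to
# stdout, progress messages go to a timestamped .log file in the working dir):
#   python3 wikidownloader.py en english_frequent_words.txt \
#       --newest --nicetitles --cleaning balanced > enwiki.prevert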