Downloads: wiki2corpus-2.0.py

File wiki2corpus-2.0.py, 16.3 KB (added by admin, 2 years ago)
#!/usr/bin/python3
#coding=utf-8

"""
MIT License

Copyright (c) 2020 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.3'

import re
import os
import sys
import argparse
import socket # socket.timeout is caught in process_page
# use requests
import urllib.request
from urllib.error import HTTPError, URLError # caught in process_page
from urllib.parse import quote as url_quote
import http.client # httplib.HTTPException
import datetime
import json
import time
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category

remove_links_re = re.compile('</?a[^>]*>')

class PageNotFound(Exception):
    pass

class RequestTimeout(Exception):
    pass

class InvalidResponse(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}

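# Convert the HTML of one page to "prevert" (pre-vertical) text: jusText splits
# the HTML into paragraphs, classifies them with the thresholds selected above,
# and only good/neargood paragraphs are kept, wrapped in <p> elements.
# Returns (prevert_text, paragraph_count, plaintext_length).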
def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #number of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #number of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #Short/near-good heads in the distance [chars] before a good par => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = ' heading="yes"' if p['heading'] else ''
                prevert_paragraphs.append('<p%s>\n%s\n</p>' % (heading, p_text))
    return ('\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)

def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

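# Download a single page through the MediaWiki parse API, store the raw JSON
# response, extract clean text with html2prevert and return
# (<doc> string, paragraph_count, revision id, time of the request).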
def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, url_quote(title))

    try:
        response_data = urllib.request.urlopen(api_url, timeout=10).read()
    except HTTPError as e:
        raise PageNotFound()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise RequestTimeout()
        else:
            raise PageNotFound()
    except socket.timeout as e:
        raise RequestTimeout()
    parse_time = datetime.datetime.now()
    try:
        response_data = response_data.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        logf.write('\tignoring a UnicodeDecodeError\n')
        response_data = response_data.decode('utf-8', errors='ignore')
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise InvalidResponse()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        linksf.write('%s\n' % '\n'.join(p['externallinks']))
    logf.write('\t%d chars\n' % plaintext_len)
    page_attrs = 'url="%s" title="%s" wiki_categories="%s" wiki_translations="%d" ' \
            'paragraphs="%d" chars="%d" crawl_date="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories,
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert)
    return (page, paragraph_count, revid, parse_time)

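# Cache-aware wrapper around process_page: skips titles that are already cached
# (optionally asking the API whether a newer revision exists), updates the hit
# counters and appends the processed title and revision id to the cache file.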
def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    #check the cache first
    if title in cache:
        #download the newest revision if there is an old version in the cache
        if newest:
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            api_url = API_JSON % (langcode, url_quote(title))
            resp = urllib.request.urlopen(api_url)
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    logf.write('\tusing old revision %s instead of the newest '
                        'revision (invalid Wiki API response data)\n' % previous_revid)
                    current_revid = previous_revid
                #skip if cached is already newest
                if current_revid == previous_revid:
                    hits_by_type['skipped'] += 1
                    logf.write('\tskipping cached\n')
                    return (page, parlen, revid, last_api_parse)
                #continue to download the page otherwise
        #skip because in cache
        else:
            hits_by_type['skipped'] += 1
            logf.write('\tskipping already downloaded\n')
            return (page, parlen, revid, last_api_parse)
    #download the page since it is not in the cache or there is a new version
    try:
        page, parlen, revid, last_api_parse =\
            process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        hits_by_type['processed'] += 1
        logf.write('\t%d paragraphs\n' % parlen)
    except Exception as e: #PageNotFound, RequestTimeout, InvalidResponse, EmptyHTML, EmptyJusText
        page = ''
        hits_by_type['empty'] += 1
        log_msg = {
            'PageNotFound': '\tskipped -- page not found',
            'RequestTimeout': '\tskipped -- request timeout',
            'InvalidResponse': '\tskipped -- invalid response',
            'EmptyHTML': '\tempty HTML parse returned by API',
            'EmptyJusText': '\tempty prevert returned by jusText'
        }.get(type(e).__name__, '\tskipped or empty -- %s' % type(e).__name__)
        logf.write('%s\n' % log_msg)
    #update the cache (previous records for the same title are replaced when reloading the cache)
    if newest:
        cache[title] = revid #zero if page not found/empty/exception
    else:
        cache.add(title)
    cf.write('%s\t%s\n' % (title, revid))
    return (page, parlen, revid, last_api_parse)

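# Main loop: load the title cache, obtain the list of article titles (from the
# latest Wikipedia dump or a custom title file), download each title (plus its
# talk page if requested) and write the resulting prevert documents to stdout.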
def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort,
        title_file_path=''):
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    if len(justext_wordlist) == 0:
        logf.write('Wordlist file is empty, switching off stopwords detection.\n')
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'] = 0
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'] = 0

    linksf = open(links, 'a') if links else None
    cache = {} if newest else set()
    if os.path.exists(cachefn):
        logf.write('Cache: %s\n' % cachefn)
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    if newest:
                        cache[title.strip()] = revid.strip()
                    else:
                        cache.add(title.strip())
                except ValueError:
                    continue
        logf.write('Cache: %d titles loaded\n' % len(cache))

    langcode2 = langcode.replace('-', '_')
    wikidump_titles_url = LATEST % (langcode2, langcode2)
    if not title_file_path:
        title_file_path = wikidump_titles_url.rsplit('/', 1)[-1].replace('.gz', '')
        if not os.path.exists(title_file_path):
            logf.write('Getting all titles from latest Wikipedia dump %s to %s\n' %
                (wikidump_titles_url, title_file_path))
            wiki_title_data = urllib.request.urlopen(wikidump_titles_url).read()
            if wikidump_titles_url.endswith('.gz'):
                from io import BytesIO
                from gzip import GzipFile
                bio = BytesIO(wiki_title_data)
                bio.seek(0)
                wiki_title_data = GzipFile(fileobj=bio).read()
            with open(title_file_path, 'wb') as title_file:
                title_file.write(wiki_title_data)

    cf = open(cachefn, 'at') # cache file
    raw_response_fp = open(raw_response_path, 'at') #raw API responses
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    for line in open(title_file_path, 'rt', encoding='utf-8', errors='ignore'):
        title = line.strip().replace('"', "'")
        # TODO: filter titles, use RE as parameter
        logf.write('%s\n' % title)
        if not title or nicetitles and not unicode_char_category(title[0])[0] in ('L', 'N'):
            logf.write('\tskipping (not a nice title)\n')
            continue
        page_titles = (title, 'Talk:' + title) if talkpages else (title,)
        for page_title in page_titles:
            page, parlen, revid, last_api_parse =\
                go_page(langcode, page_title, linksf,
                    raw_response_fp, newest, justext_wordlist, logf,
                    last_api_request, last_api_parse, wait_interval,
                    justext_level, allowshort, cache, cf, hits_by_type)
            if page:
                sys.stdout.write(page)
    logf.write('Updated cache database stored in %s\n' % cachefn)
    if linksf:
        linksf.close()
    cf.close()
    for hit_type, hit_count in hits_by_type.items():
        logf.write('%s: %d\n' % (hit_type.title(), hit_count))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical or numerical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    parser.add_argument('--title-file', help='Path to a custom list of titles to download, one per line.', type=str, default='')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.rstrip() for line in fp])
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort, args.title_file)
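
Note: the script appends every raw API response to the *_raw_data file (see raw_response_fp in process_page) so that pages can be re-processed later without downloading them again. The snippet below is a minimal sketch, not part of wiki2corpus-2.0.py, of how such a file could be read back; the file name 'en_raw_data' is only an example (the real name is derived from --cache or the language code). It assumes the record format written by process_page: a header line '<api_url>\t<character count>' followed by that many characters of JSON response text (including the trailing newline).

import json

def iter_raw_responses(path):
    # Yield (api_url, parsed JSON) for every record stored by wiki2corpus.
    with open(path, 'rt', encoding='utf-8') as fp:
        while True:
            header = fp.readline()
            if not header:
                break # end of file
            api_url, length = header.rstrip('\n').rsplit('\t', 1)
            body = fp.read(int(length)) # response text incl. trailing newline
            yield api_url, json.loads(body)

# Example (hypothetical file name):
for url, data in iter_raw_responses('en_raw_data'):
    print(url, len(data['parse']['text']['*']))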