Downloads: wiki2corpus-2.0.py

File wiki2corpus-2.0.py, 16.3 KB (added by admin, 2 years ago)
#!/usr/bin/python3
#coding=utf-8

"""
MIT License

Copyright (c) 2020 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.3'

import re
import os
import sys
import argparse
import socket # socket.timeout is caught in process_page
# use requests
import urllib.request
from urllib.error import HTTPError, URLError # caught in process_page
from urllib.parse import quote as url_quote
import http.client # httplib.HTTPException
import datetime
import json
import time
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category

remove_links_re = re.compile('</?a[^>]*>')

class PageNotFound(Exception):
    pass

class RequestTimeout(Exception):
    pass

class InvalidResponse(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}

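# Convert the HTML of one page to "prevert" (pre-vertical) text: jusText splits
# the HTML into paragraphs, classifies them with the thresholds selected above,
# and only good/neargood paragraphs are kept, wrapped in <p> elements.
# Returns (prevert_text, paragraph_count, plaintext_length).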
def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #number of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #number of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #Short/near-good heads in the distance [chars] before a good par => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = ' heading="yes"' if p['heading'] else ''
                prevert_paragraphs.append('<p%s>\n%s\n</p>' % (heading, p_text))
    return ('\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)

def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

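# Download a single page through the MediaWiki parse API, store the raw JSON
# response, extract clean text with html2prevert and return
# (<doc> string, paragraph_count, revision id, time of the request).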
def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, url_quote(title))

    try:
        response_data = urllib.request.urlopen(api_url, timeout=10).read()
    except HTTPError as e:
        raise PageNotFound()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise RequestTimeout()
        else:
            raise PageNotFound()
    except socket.timeout as e:
        raise RequestTimeout()
    parse_time = datetime.datetime.now()
    try:
        response_data = response_data.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        logf.write('\tignoring a UnicodeDecodeError\n')
        response_data = response_data.decode('utf-8', errors='ignore')
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise InvalidResponse()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        linksf.write('%s\n' % '\n'.join(p['externallinks']))
    logf.write('\t%d chars\n' % plaintext_len)
    page_attrs = 'url="%s" title="%s" wiki_categories="%s" wiki_translations="%d" ' \
            'paragraphs="%d" chars="%d" crawl_date="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories,
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert)
    return (page, paragraph_count, revid, parse_time)

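# Cache-aware wrapper around process_page: skips titles that are already cached
# (optionally asking the API whether a newer revision exists), updates the hit
# counters and appends the processed title and revision id to the cache file.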
def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    #check the cache first
    if title in cache:
        #download the newest revision if there is an old version in the cache
        if newest:
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            api_url = API_JSON % (langcode, url_quote(title))
            resp = urllib.request.urlopen(api_url)
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    logf.write('\tusing old revision %s instead of the newest '
                        'revision (invalid Wiki API response data)\n' % previous_revid)
                    current_revid = previous_revid
                #skip if cached is already newest
                if current_revid == previous_revid:
                    hits_by_type['skipped'] += 1
                    logf.write('\tskipping cached\n')
                    return (page, parlen, revid, last_api_parse)
                #continue to download the page otherwise
        #skip because in cache
        else:
            hits_by_type['skipped'] += 1
            logf.write('\tskipping already downloaded\n')
            return (page, parlen, revid, last_api_parse)
    #download the page since it is not in the cache or there is a new version
    try:
        page, parlen, revid, last_api_parse =\
            process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        hits_by_type['processed'] += 1
        logf.write('\t%d paragraphs\n' % parlen)
    except Exception as e: #PageNotFound, RequestTimeout, InvalidResponse, EmptyHTML, EmptyJusText
        page = ''
        hits_by_type['empty'] += 1
        log_msg = {
            'PageNotFound': '\tskipped -- page not found',
            'RequestTimeout': '\tskipped -- request timeout',
            'InvalidResponse': '\tskipped -- invalid response',
            'EmptyHTML': '\tempty HTML parse returned by API',
            'EmptyJusText': '\tempty prevert returned by jusText'
        }.get(type(e).__name__, '\tskipped or empty -- %s' % type(e).__name__)
        logf.write('%s\n' % log_msg)
    #update the cache (previous records for the same title are replaced when reloading the cache)
    if newest:
        cache[title] = revid #zero if page not found/empty/exception
    else:
        cache.add(title)
    cf.write('%s\t%s\n' % (title, revid))
    return (page, parlen, revid, last_api_parse)

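# Main loop: load the title cache, obtain the list of article titles (from the
# latest Wikipedia dump or a custom title file), download each title (plus its
# talk page if requested) and write the resulting prevert documents to stdout.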
def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort,
        title_file_path=''):
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    if len(justext_wordlist) == 0:
        logf.write('Wordlist file is empty, switching off stopwords detection.\n')
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'] = 0
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'] = 0

    linksf = open(links, 'a') if links else None
    cache = {} if newest else set()
    if os.path.exists(cachefn):
        logf.write('Cache: %s\n' % cachefn)
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    if newest:
                        cache[title.strip()] = revid.strip()
                    else:
                        cache.add(title.strip())
                except ValueError:
                    continue
        logf.write('Cache: %d titles loaded\n' % len(cache))

    langcode2 = langcode.replace('-', '_')
    wikidump_titles_url = LATEST % (langcode2, langcode2)
    if not title_file_path:
        title_file_path = wikidump_titles_url.rsplit('/', 1)[-1].replace('.gz', '')
        if not os.path.exists(title_file_path):
            logf.write('Getting all titles from latest Wikipedia dump %s to %s\n' %
                (wikidump_titles_url, title_file_path))
            wiki_title_data = urllib.request.urlopen(wikidump_titles_url).read()
            if wikidump_titles_url.endswith('.gz'):
                from io import BytesIO
                from gzip import GzipFile
                bio = BytesIO(wiki_title_data)
                bio.seek(0)
                wiki_title_data = GzipFile(fileobj=bio).read()
            with open(title_file_path, 'wb') as title_file:
                title_file.write(wiki_title_data)

    cf = open(cachefn, 'at') # cache file
    raw_response_fp = open(raw_response_path, 'at') #raw API responses
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    for line in open(title_file_path, 'rt', encoding='utf-8', errors='ignore'):
        title = line.strip().replace('"', "'")
        # TODO: filter titles, use RE as parameter
        logf.write('%s\n' % title)
        if not title or nicetitles and not unicode_char_category(title[0])[0] in ('L', 'N'):
            logf.write('\tskipping (not a nice title)\n')
            continue
        page_titles = (title, 'Talk:' + title) if talkpages else (title,)
        for page_title in page_titles:
            page, parlen, revid, last_api_parse =\
                go_page(langcode, page_title, linksf,
                    raw_response_fp, newest, justext_wordlist, logf,
                    last_api_request, last_api_parse, wait_interval,
                    justext_level, allowshort, cache, cf, hits_by_type)
            if page:
                sys.stdout.write(page)
    logf.write('Updated cache database stored in %s\n' % cachefn)
    if linksf:
        linksf.close()
    cf.close()
    for hit_type, hit_count in hits_by_type.items():
        logf.write('%s: %d\n' % (hit_type.title(), hit_count))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical or numerical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    parser.add_argument('--title-file', help='Path to a custom list of titles to download, one per line.', type=str, default='')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.rstrip() for line in fp])
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort, args.title_file)
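
Note: the script appends every raw API response to the *_raw_data file (see raw_response_fp in process_page) so that pages can be re-processed later without downloading them again. The snippet below is a minimal sketch, not part of wiki2corpus-2.0.py, of how such a file could be read back; the file name 'en_raw_data' is only an example (the real name is derived from --cache or the language code). It assumes the record format written by process_page: a header line '<api_url>\t<character count>' followed by that many characters of JSON response text (including the trailing newline).

import json

def iter_raw_responses(path):
    # Yield (api_url, parsed JSON) for every record stored by wiki2corpus.
    with open(path, 'rt', encoding='utf-8') as fp:
        while True:
            header = fp.readline()
            if not header:
                break # end of file
            api_url, length = header.rstrip('\n').rsplit('\t', 1)
            body = fp.read(int(length)) # response text incl. trailing newline
            yield api_url, json.loads(body)

# Example (hypothetical file name):
for url, data in iter_raw_responses('en_raw_data'):
    print(url, len(data['parse']['text']['*']))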