Downloads: wiki2corpus-1.2.0.py

File wiki2corpus-1.2.0.py, 14.1 KB (added by admin, 5 months ago)
#!/usr/bin/python
#coding=utf-8

"""
MIT License

Copyright (c) 2017 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.2.0'

import re
import os
import sys
import argparse
# TODO: replace urllib2/urllib/httplib with requests
import urllib2
import urllib
import httplib # httplib.HTTPException
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category
remove_links_re = re.compile(u'</?a[^>]*>')

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx
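
# Only a few fields of the action=parse JSON response are consumed below
# (see process_page). A trimmed, illustrative sketch of its shape -- real
# responses carry many more fields and the values here are made up:
#   {"parse": {
#       "title": "Prague",
#       "revid": 123456,
#       "text": {"*": "<div>...rendered article HTML...</div>"},
#       "langlinks": [{"lang": "cs", "*": "Praha"}],
#       "categories": [{"sortkey": "", "*": "Capitals_in_Europe"}],
#       "externallinks": ["http://example.com/"]
#   }}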

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}
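
# A rough worked example of the thresholds above (values hypothetical): under
# 'strict', a paragraph of 150 characters with a stopword density of 0.28 sits
# between length_low=70 and length_high=200 and between stopwords_low=0.25 and
# stopwords_high=0.32, so jusText rates it 'neargood' and html2prevert below
# keeps it ('good' and 'neargood' paragraphs are extracted).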

def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        #lower the length thresholds to a third, e.g. for ideographic scripts
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #ratio of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #ratio of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #short/neargood headings within this distance [chars] before a good paragraph => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = u' heading="1"' if p['heading'] else u''
                prevert_paragraphs.append(u'<p%s>\n%s\n</p>' % (heading, p_text))
    return (u'\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
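
# Illustrative call (hypothetical file and variable names; see main() below
# for the real wiring):
#   justext_wordlist = set(l.decode('utf-8').rstrip() for l in open('english_2000.txt'))
#   prevert, par_count, text_len = html2prevert(html_bytes, justext_wordlist)
# The returned prevert string holds the kept paragraphs in the vertical format:
#   <p heading="1">
#   History
#   </p>
#   <p>
#   The city was founded ...
#   </p>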

def api_wait(last, wait_interval):
    #sleep long enough to keep at least wait_interval seconds between API requests
    n = datetime.datetime.now()
    interval = (n - last).total_seconds()
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    #percent-encode the title so special characters cannot break the query string
    api_url = API_HTML % (langcode, urllib.quote(title))
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except (IOError, httplib.HTTPException):
        # IOError includes both urllib2.URLError and socket.error (Python >= 2.6 for the latter)
        raise MissingPage()
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if not html:
        raise EmptyHTML()
    #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
    html = remove_links_re.sub('', html)
    prevert, paragraph_count, plaintext_len = html2prevert(
        html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    page_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert.encode('utf-8'))
    return (page, paragraph_count, revid, parse_time)
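
# The <doc> header written for each page looks like this (values hypothetical;
# wrapped here for readability, the script writes it on one line):
#   <doc url="https://en.wikipedia.org/wiki/Prague" title="Prague"
#       categories="Capitals_in_Europe|..." translations="142" paragraphs="38"
#       chars="21500" downloaded="2017-05-01 12:00">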

def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    if title in cache: # if page has been found in cache
        if newest: # prepare to download the newest revision
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            # TODO: propagate the updated last_api_request back to the caller
            last_api_request = datetime.datetime.now()
            resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    print >>logf, '\tusing old revision %s instead of the newest ' \
                        'revision (invalid Wiki API response data)' % previous_revid
                    current_revid = previous_revid
                #compare as strings: revids read back from the cache file are text,
                #while revids taken from the API JSON are numbers
                if str(current_revid) == str(previous_revid): # skip if cached is already newest
                    hits_by_type['skipped'] += 1
                    print >>logf, '\tskipping cached'
                    return (page, parlen, revid, last_api_parse)
        else: # skip because in cache
            hits_by_type['skipped'] += 1
            print >>logf, '\tskipping already downloaded'
            return (page, parlen, revid, last_api_parse)
    # download the page
    try:
        page, parlen, revid, last_api_parse =\
                process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        cache[title] = revid
        cf.write('%s\t%s\n' % (title, revid))
        hits_by_type['processed'] += 1
        print >>logf, '\t%d paragraphs' % parlen
    except (MissingPage, EmptyHTML, EmptyJusText) as e:
        page = ''
        hits_by_type['empty'] += 1
        print >>logf, {
                'MissingPage': '\tempty because not found',
                'EmptyHTML': '\tempty HTML parse returned by API',
                'EmptyJusText': '\tempty prevert returned by jusText'} \
                [type(e).__name__]
    return (page, parlen, revid, last_api_parse)

def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort):
    last_api_request = datetime.datetime.now()
    last_api_parse   = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
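    # The cache file keeps one "title<TAB>revid" record per line; it is loaded
    # above at start-up and appended to by go_page() as pages are processed, so
    # an interrupted run can resume without re-downloading finished pages.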
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    wikidump_titles_path = LATEST % (langcode.replace('-', '_'), langcode.replace('-', '_'))
    print >>logf, 'Getting all titles from latest Wikipedia dump %s' % wikidump_titles_path
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    filename, _ = urllib.urlretrieve(wikidump_titles_path)
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            if not title: #skip blank lines in the dump
                continue
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            if nicetitles and unicode_char_category(unicode(title, 'utf-8')[0])[0] != 'L':
                print >>logf, '\tskipping (not a nice title)', title
                continue
            for page_title in filter(None, [title, 'Talk:' + title if talkpages else None]):
                page, parlen, revid, last_api_parse =\
                        go_page(langcode, page_title, linksf,
                        raw_response_fp, newest, justext_wordlist, logf,
                        last_api_request, last_api_parse, wait_interval,
                        justext_level, allowshort, cache, cf, hits_by_type)
                if page:
                    sys.stdout.write(page)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    raw_response_fp.close()
    for hit_type, hit_count in hits_by_type.items():
        print >>logf, '%s: %d' % (hit_type.title(), hit_count)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests [s]', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia into this file', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages as well', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort)
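
# Example invocation (a sketch; the wordlist file name is hypothetical):
#   python wiki2corpus-1.2.0.py en english_top2000.txt --cleaning strict --wait 1.5 > enwiki.prevert
# The prevert corpus is written to stdout; progress goes to a timestamped log
# file such as en_2017-05-01_12-00-00.log in the working directory.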