Downloads: wikidownloader.py

File wikidownloader.py, 7.7 KB (added by admin, 8 months ago)
#!/usr/bin/python

"""
MIT License

Copyright (c) 2016 Vit Baisa

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import os
import sys
import argparse
import urllib2
import urllib
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError

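# Exceptions signalling articles that cannot be turned into a document:
# a missing page, empty HTML returned by the API, or empty jusText output.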
class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

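# URL templates: the latest dump of all article titles (namespace 0) and the
# MediaWiki API endpoints for parsed HTML and for revision metadata.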
LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

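# Timestamps of the most recent API calls; api_wait() uses them to keep at
# least wait_interval seconds between consecutive requests.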
last_api_request = datetime.datetime.now()
last_api_parse   = datetime.datetime.now()
logf = None

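# Convert an HTML page into a list of jusText paragraphs; on lxml parse errors
# an empty list is returned and the article is treated as empty.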
def html2prevert(s):
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        return justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return []

def api_wait(last):
    global wait_interval
    n = datetime.datetime.now()
    interval = (n - last).seconds + ((n - last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

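# Download a single article through the parse API, clean it with jusText and
# return the document as a <doc>...</doc> string together with the number of
# paragraphs and the current revision ID.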
def process_article(langcode, title, linksf):
    global last_api_parse
    global logf
    api_wait(last_api_parse)
    last_api_parse = datetime.datetime.now()
    resp = urllib2.urlopen(API_HTML % (langcode, urllib.quote(title))) # percent-encode the title for the URL
    data = json.load(resp)
    if 'error' in data:
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if html.strip():
        pars = html2prevert(html.encode('utf-8')) # justext decodes!
    else:
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    if not pars:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    outp = []
    for par in pars:
        parx = justext.html_escape(par['text'])
        outp.append(parx)
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    #links = '\v'.join([d['*'].replace('"', '') for d in p['links']])
    categories = '\v'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line + '\n')
        linksf.write('\n')
    s = ''
    chars = 0
    for paragraph in outp:
        s += '<p>\n'
        s += paragraph
        s += '\n</p>\n'
        chars += len(paragraph)
    print >>logf, '\t%d chars' % chars
    header = '<doc title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d">\n' %\
            (title, categories.encode('utf-8'), langlinks_len,
                    #links.encode('utf-8'),
                    len(outp), chars)
    return header + s.encode('utf-8') + '</doc>\n', len(outp), revid

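# Fetch the list of all article titles from the latest dump, then download
# every title not yet in the cache (or whose revision changed, with --newest),
# writing documents to stdout and title/revid pairs to the cache file.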
def main(langcode, cachefn, logfn, newest, links):
    global last_api_request
    global logf
    logf = open(logfn, 'w')
    print >>sys.stderr, "Log will be stored in %s" % logfn
    linksf = open(links, 'w') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'w') # empty cache file
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            article = '' # reset so a skipped title never re-emits the previous article
            print >>logf, '%s' % title
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
                    data = json.load(resp)
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            current_revid = dqp[key]['revisions'][0]['revid']
                            if previous_revid != str(current_revid):
                                article, parlen, revid = process_article(langcode, title, linksf)
                                cache[title] = revid
                                cf.write('%s\t%s\n' % (title, revid))
                            else:
                                print >>logf, '\tskipping cached'
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        empty_articles += 1
                else:
                    # do not download
                    print >>logf, '\tskip already downloaded'
                    skipped_articles += 1
            else:
                try:
                    article, parlen, revid = process_article(langcode, title, linksf)
                    cache[title] = revid
                    cf.write('%s\t%s\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', type=str, help='Wikipedia language prefix')
    parser.add_argument('--cache', help='File with previously downloaded titles and revision IDs', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests (seconds)', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    args = parser.parse_args()
    wait_interval = args.wait

    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = current_time + '.log'
    cachefile = args.cache or args.langcode + 'wiki.cache'
    main(args.langcode, cachefile, logfile, args.newest, args.links)
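Usage sketch (assumes Python 2 with the justext and lxml packages installed; the file names below are only examples). The cleaned corpus is written to standard output, and a timestamped .log file is created in the current directory:

    python wikidownloader.py en --wait 1.0 --newest --links en_links.txt > enwiki.prevert

Without --cache, the title/revision cache is kept in <langcode>wiki.cache in the current directory and is reused on the next run to skip already downloaded articles.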