Downloads: wiki2corpus-1.2.0.py

File wiki2corpus-1.2.0.py, 14.1 KB (added by admin, 6 years ago)
#!/usr/bin/python
#coding=utf-8

"""
MIT License

Copyright (c) 2017 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.2.0'

import re
import os
import sys
import argparse
# TODO: replace urllib2/urllib/httplib with the requests library
import urllib2
import urllib
import httplib # httplib.HTTPException
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category

remove_links_re = re.compile(u'</?a[^>]*>')

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'
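# The %s slots are the language code and the page title, e.g.
#   API_HTML % ('en', 'Prague') ->
#   https://en.wikipedia.org/w/api.php?action=parse&page=Prague&format=json
# ('Prague' is just an illustrative title.)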

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}
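# The four levels relax the jusText thresholds step by step: 'verystrict'
# keeps the jusText defaults, while 'balanced' and 'permissive' accept
# shorter paragraphs with fewer stopwords and a higher link density.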

def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    """Extract good/neargood paragraphs from raw HTML with jusText and wrap
    them in <p> elements; return (prevert, paragraph_count, plaintext_len)."""
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #ratio of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #ratio of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #short/neargood headings within this distance [chars] before a good paragraph => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = u' heading="1"' if p['heading'] else u''
                prevert_paragraphs.append(u'<p%s>\n%s\n</p>' % (heading, p_text))
    return (u'\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
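# Sketch of the expected use (assuming a UTF-8 HTML byte string and a stopword
# set loaded as in __main__ below):
#   prevert, par_count, text_len = html2prevert(html_bytes, justext_wordlist)
# 'prevert' then holds <p>...</p> blocks ready to be wrapped in a <doc> element.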

def api_wait(last, wait_interval):
    #sleep long enough to keep at least wait_interval seconds between API requests
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, title)
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except (IOError, httplib.HTTPException):
        # IOError includes both urllib2.URLError and socket.error (Python >= 2.6 for the latter)
        raise MissingPage()
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
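    #each record: a "<api_url>\t<length>\n" header line followed by the raw
    #JSON response; <length> counts the response plus its trailing "\n"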
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    page_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert.encode('utf-8'))
    return (page, paragraph_count, revid, parse_time)

def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    if title in cache: # if page has been found in cache
        if newest: # prepare to download the newest revision
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            resp = urllib2.urlopen(API_JSON % (langcode, title))
            data = json.load(resp)
            dqp = data['query']['pages']
            current_revid = previous_revid # fall back to the cached revision if the response contains no pages
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    print >>logf, '\tusing old revision %s instead of the newest ' \
                            'revision (invalid Wiki API response data)' % previous_revid
                    current_revid = previous_revid
            #cached revids are read from a file as strings while the API returns
            #integers, so compare them as strings
            if str(current_revid) == str(previous_revid): # skip if cached is already newest
                hits_by_type['skipped'] += 1
                print >>logf, '\tskipping cached'
                return (page, parlen, revid, last_api_parse)
        else: # skip because in cache
            hits_by_type['skipped'] += 1
            print >>logf, '\tskipping already downloaded'
            return (page, parlen, revid, last_api_parse)
    # download the page
    try:
        page, parlen, revid, last_api_parse =\
                process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        cache[title] = revid
        cf.write('%s\t%s\n' % (title, revid))
        hits_by_type['processed'] += 1
        print >>logf, '\t%d paragraphs' % parlen
    except (MissingPage, EmptyHTML, EmptyJusText) as e:
        page = ''
        hits_by_type['empty'] += 1
        print >>logf, {
                'MissingPage': '\tempty because not found',
                'EmptyHTML': '\tempty HTML parse returned by API',
                'EmptyJusText': '\tempty prevert returned by jusText'} \
                [type(e).__name__]
    return (page, parlen, revid, last_api_parse)

def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort):
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    cache = {}
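    #the cache maps page title -> revid of the last downloaded revision;
    #it is persisted in cachefn as one "<title>\t<revid>" line per page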
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    wikidump_titles_path = LATEST % (langcode.replace('-', '_'), langcode.replace('-', '_'))
    print >>logf, 'Getting all titles from the latest Wikipedia dump %s' % wikidump_titles_path
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    filename, _ = urllib.urlretrieve(wikidump_titles_path)
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            if nicetitles and not unicode_char_category(unicode(title, 'utf-8')[0])[0] == 'L':
                print >>logf, '\tskipping (not a nice title)', title
                continue
            for page_title in filter(None, [title, 'Talk:' + title if talkpages else None]):
                page, parlen, revid, last_api_parse =\
                        go_page(langcode, page_title, linksf,
                        raw_response_fp, newest, justext_wordlist, logf,
                        last_api_request, last_api_parse, wait_interval,
                        justext_level, allowshort, cache, cf, hits_by_type)
                if page:
                    sys.stdout.write(page)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    for hit_type, hit_count in hits_by_type.items():
        print >>logf, '%s: %d' % (hit_type.title(), hit_count)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests [seconds]', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia into this file', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages as well', action='store_true')
    parser.add_argument('--cleaning', help='Strictness of Justext boilerplate & short paragraph removal (default: strict)',
            type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    logfn = args.langcode.replace('/', '') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
                justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
                args.cleaning, args.allowshort)
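
# Example invocation (hypothetical file names; the corpus is written to stdout
# and a <langcode>_<timestamp>.log file is created in the working directory):
#   python wiki2corpus-1.2.0.py cs czech_top2000_words.txt --cleaning balanced > cswiki.prevert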