1 | #!/usr/bin/python
|
---|
2 | #coding=utf-8
|
---|
3 |
|
---|
4 | """
|
---|
5 | MIT License
|
---|
6 |
|
---|
7 | Copyright (c) 2017 Vit Baisa, Vit Suchomel, Marek Blahus
|
---|
8 |
|
---|
9 | Permission is hereby granted, free of charge, to any person obtaining a copy
|
---|
10 | of this software and associated documentation files (the "Software"), to deal
|
---|
11 | in the Software without restriction, including without limitation the rights
|
---|
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
---|
13 | copies of the Software, and to permit persons to whom the Software is
|
---|
14 | furnished to do so, subject to the following conditions:
|
---|
15 |
|
---|
16 | The above copyright notice and this permission notice shall be included in all
|
---|
17 | copies or substantial portions of the Software.
|
---|
18 |
|
---|
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
---|
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
---|
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
---|
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
---|
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
---|
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
---|
25 | SOFTWARE.
|
---|
26 | """
|
---|
27 |
|
---|
28 | """
|
---|
29 | MediaWiki API help:
|
---|
30 | https://www.mediawiki.org/w/api.php?action=help&modules=query
|
---|
31 | """
|
---|
32 |
|
---|
VERSION = '1.2.0'  # script version string (kept for release tracking)
|
---|
34 |
|
---|
35 | import re
|
---|
36 | import os
|
---|
37 | import sys
|
---|
38 | import argparse
|
---|
39 | # use requests
|
---|
40 | import urllib2
|
---|
41 | import urllib
|
---|
42 | import httplib # httplib.HTTPException
|
---|
43 | import datetime
|
---|
44 | import json
|
---|
45 | import time
|
---|
46 | import gzip
|
---|
47 | from justext import core as justext
|
---|
48 | from lxml.etree import XMLSyntaxError, ParserError
|
---|
49 | from unicodedata import category as unicode_char_category
|
---|
50 |
|
---|
# Matches opening and closing <a ...> tags (the tags only, not the link text);
# used to strip links from the API HTML before jusText processing.
remove_links_re = re.compile(u'</?a[^>]*>')
|
---|
52 |
|
---|
class MissingPage(Exception):
    """Raised when the Wiki API cannot deliver a page: network/HTTP failure,
    an empty response, or a response containing an 'error' key."""
    pass
|
---|
55 |
|
---|
class EmptyHTML(Exception):
    """Raised when the API's parsed HTML for a page is empty after stripping."""
    pass
|
---|
58 |
|
---|
class EmptyJusText(Exception):
    """Raised when jusText classifies no paragraph as good/neargood,
    i.e. html2prevert returns an empty prevert."""
    pass
|
---|
61 |
|
---|
# Gzipped list of all article titles in namespace 0 from the latest dump
# (both %s slots receive the language code with '-' replaced by '_').
LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
# Base article URL; %s is the language code.
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
# API endpoint returning the parsed HTML of a page (action=parse); %s = langcode, title.
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
# API endpoint returning revision metadata for a page (action=query); %s = langcode, title.
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'
|
---|
66 |
|
---|
67 | # TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx
|
---|
68 |
|
---|
# jusText cleaning thresholds by strictness level, from 'verystrict' (the
# jusText defaults) down to 'permissive'.  Consumed by html2prevert, which
# passes them to justext.classify_paragraphs / revise_paragraph_classification.
JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}
|
---|
107 |
|
---|
def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    """Turn an HTML string into a "prevert" (vertical-format <p> blocks).

    s -- UTF-8 encoded HTML (jusText decodes it itself)
    justext_wordlist -- stoplist of frequent words for the language
    justext_level -- key into JUSTEXT_PARAMS_BY_LEVEL
    allowshort -- divide the length/heading thresholds by 3 (useful for
        ideographic scripts where texts are much shorter in characters)

    Returns (prevert_text, paragraph_count, plaintext_length); on an HTML
    parse failure returns ('', 0, 0).
    """
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        doc_root = justext.preprocess(html_text=s, encoding='utf-8')
        pars = justext.make_paragraphs(doc_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    # classify paragraphs with jusText using the thresholds of the chosen level
    params = JUSTEXT_PARAMS_BY_LEVEL[justext_level]
    length_low = params['length_low']
    length_high = params['length_high']
    heading_distance = params['max_heading_distance']
    if allowshort:
        length_low = length_low / 3
        length_high = length_high / 3
        heading_distance = heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=pars,
        stoplist=justext_wordlist,
        length_low=length_low, #character count < length_low => bad or short
        length_high=length_high, #character count > length_high => good
        stopwords_low=params['stopwords_low'], #frequent-word ratio >= stopwords_low => neargood
        stopwords_high=params['stopwords_high'], #frequent-word ratio >= stopwords_high => good or neargood
        max_link_density=params['max_link_density'] #link-word density > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=pars,
        max_heading_distance=heading_distance #short/neargood heading within this distance [chars] of a good par => good
    )
    # keep good and neargood paragraphs, wrap each in a <p> element
    kept = []
    par_count = 0
    text_len = 0
    for par in pars:
        #NOTE: 'cfclass' (context-free class) is used on purpose;
        #using the final 'class' did not produce a good result (see history)
        if par['cfclass'] not in ('good', 'neargood'):
            continue
        par_text = justext.html_escape(par['text']).strip()
        if not par_text:
            continue
        par_count += 1
        text_len += len(par_text)
        heading_attr = u' heading="1"' if par['heading'] else u''
        kept.append(u'<p%s>\n%s\n</p>' % (heading_attr, par_text))
    return (u'\n'.join(kept), par_count, text_len)
|
---|
148 |
|
---|
def api_wait(last, wait_interval):
    """Sleep so that at least wait_interval seconds separate consecutive
    API requests.

    last -- datetime of the previous request
    wait_interval -- minimum gap in seconds (float)

    Fix: the original computed the elapsed time as
    (now - last).seconds + (now - last).microseconds / 1e6, which ignores the
    timedelta's .days component; if the previous request was more than a day
    ago the value wrapped around and caused a pointless sleep.
    total_seconds() (Python >= 2.7) gives the true elapsed time.
    """
    elapsed = (datetime.datetime.now() - last).total_seconds()
    if elapsed < wait_interval:
        time.sleep(wait_interval - elapsed)
|
---|
154 |
|
---|
def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    """Download one article through the parse API, clean it with jusText and
    return it wrapped in a <doc> element.

    langcode -- Wikipedia language prefix (e.g. 'en')
    title -- article title as it appears in the titles dump
    linksf -- open file for external links, or None to skip collecting them
    raw_response_fp -- open file where the raw API response is appended
    justext_wordlist -- stoplist passed through to html2prevert
    logf -- log file (written with Python 2 `print >>`)
    last_api_parse -- datetime of the previous parse request (for throttling)
    wait_interval -- minimum seconds between API requests
    justext_level, allowshort -- cleaning parameters for html2prevert

    Returns (page, paragraph_count, revid, parse_time).
    Raises MissingPage on download/JSON error, EmptyHTML when the API returns
    no HTML, EmptyJusText when cleaning leaves no paragraphs.
    """
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, title)
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except (IOError, httplib.HTTPException):
        # IOError includes both urllib2.URLError and socket.error (Python >= 2.6 for the latter)
        raise MissingPage()
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
    # header line: URL <TAB> payload length (+1 for the trailing newline below)
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    # double quotes are stripped so categories can sit inside the attribute value
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    page_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert.encode('utf-8'))
    return (page, paragraph_count, revid, parse_time)
|
---|
199 |
|
---|
def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    """Decide whether to (re)download a page, do it, and update the cache.

    A title already in `cache` is skipped, unless `newest` is set, in which
    case the current revid is fetched from the API and the page is only
    re-downloaded when its revision changed.  Results are counted in
    `hits_by_type` ('processed'/'skipped'/'empty') and new revids are
    appended to the cache file `cf`.

    Returns (page, paragraph_count, revid, last_api_parse); `page` is ''
    when the page was skipped or empty.
    """
    page, parlen, revid = '', 0, 0
    if title in cache: # if page has been found in cache
        if newest: # prepare to download the newest revision
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            resp = urllib2.urlopen(API_JSON % (langcode, title))
            data = json.load(resp)
            dqp = data['query']['pages']
            # the 'pages' map is keyed by page id; iterate to reach the single entry
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    print >>logf, '\tusing old revision %s instead of the newest ' \
                            'revision (invalid Wiki API response data)' % previous_revid
                    current_revid = previous_revid
            if current_revid == previous_revid: # skip if cached is already newest
                hits_by_type['skipped'] += 1
                print >>logf, '\tskipping cached'
                return (page, parlen, revid, last_api_parse)
        else: # skip because in cache
            hits_by_type['skipped'] += 1
            print >>logf, '\tskipping already downloaded'
            return (page, parlen, revid, last_api_parse)
    # download the page
    try:
        # process_page returns parse_time as the 4th item => becomes the new last_api_parse
        page, parlen, revid, last_api_parse =\
                process_page(langcode, title, linksf,
                        raw_response_fp, justext_wordlist, logf,
                        last_api_parse, wait_interval,
                        justext_level, allowshort)
        cache[title] = revid
        cf.write('%s\t%s\n' % (title, revid))
        hits_by_type['processed'] += 1
        print >>logf, '\t%d paragraphs' % parlen
    except (MissingPage, EmptyHTML, EmptyJusText) as e:
        page = ''
        hits_by_type['empty'] += 1
        # log a message depending on which of the three exceptions was raised
        print >>logf, {
            'MissingPage': '\tempty because not found',
            'EmptyHTML': '\tempty HTML parse returned by API',
            'EmptyJusText': '\tempty prevert returned by jusText'} \
            [type(e).__name__]
    return (page, parlen, revid, last_api_parse)
|
---|
246 |
|
---|
def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort):
    """Drive the whole download: fetch the title dump, iterate titles, call
    go_page for each (and for its talk page when requested) and write the
    resulting prevert documents to stdout.

    cachefn -- path of the title->revid cache file (read, then appended to)
    raw_response_path -- file where raw API responses are appended
    newest/links/nicetitles/talkpages -- see the argparse help strings
    """
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    # load the cache of previously downloaded titles (title -> revid)
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    # malformed cache line; ignore it
                    continue
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    wikidump_titles_path = LATEST % (langcode.replace('-', '_'), langcode.replace('-', '_'))
    print >>logf, 'Getting all titles from latest Wikipedia dump %s' % wikidump_titles_path
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    # download the gzipped title list to a local temporary file
    filename, _ = urllib.urlretrieve(wikidump_titles_path)
    with gzip.open(filename) as df:
        for line in df:
            # double quotes would break the <doc> attribute quoting later
            title = line.strip().replace('"', "'")
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            # "nice" titles start with a letter (Unicode category L*)
            if nicetitles and not unicode_char_category(unicode(title, 'utf-8')[0])[0] == 'L':
                print >>logf, '\tskipping (not a nice title)', title
                continue
            # process the article and, optionally, its talk page
            for page_title in filter(None, [title, 'Talk:' + title if talkpages else None]):
                page, parlen, revid, last_api_parse =\
                        go_page(langcode, page_title, linksf,
                                raw_response_fp, newest, justext_wordlist, logf,
                                last_api_request, last_api_parse, wait_interval,
                                justext_level, allowshort, cache, cf, hits_by_type)
                if page:
                    sys.stdout.write(page)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    # summary counts per outcome type
    for hit_type, hit_count in hits_by_type.items():
        print >>logf, '%s: %d' % (hit_type.title(), hit_count)
|
---|
291 |
|
---|
if __name__ == '__main__':
    # command-line interface; see the help strings for each option
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with alphabetical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
            type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    args = parser.parse_args()
    # derive the cache and raw-response file names from --cache or the langcode
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    # load the frequent-word stoplist for jusText (one UTF-8 word per line)
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    # timestamped log file, line-buffered so progress is visible while running
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
                justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
                args.cleaning, args.allowshort)
|
---|