Downloads: wiki2corpus-2.0.py

File wiki2corpus-2.0.py, 16.3 KB (added by admin, 9 months ago)
#!/usr/bin/python3
#coding=utf-8

"""
MIT License

Copyright (c) 2020 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""
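# The action=parse endpoint used below returns JSON; a rough sketch of the
# fields consumed in process_page() (values illustrative, other fields ignored):
#   {"parse": {"revid": 123456789, "text": {"*": "<div>...rendered HTML...</div>"},
#              "langlinks": [...], "categories": [{"*": "Category_name"}, ...],
#              "externallinks": ["http://example.org", ...]}}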

VERSION = '1.3'

import re
import os
import sys
import argparse
import socket
# use requests
import urllib.request
from urllib.parse import quote as url_quote
from urllib.error import HTTPError, URLError
import http.client # httplib.HTTPException
import datetime
import json
import time
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category

remove_links_re = re.compile('</?a[^>]*>')

class PageNotFound(Exception):
    pass

class RequestTimeout(Exception):
    pass

class InvalidResponse(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}
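# The --cleaning option selects one of the parameter sets above; html2prevert()
# below passes them to jusText, and with --allowshort it divides length_low,
# length_high and max_heading_distance by 3. For example (assuming 'strict'):
#   JUSTEXT_PARAMS_BY_LEVEL['strict']['length_low']  # 70, or ~23 with --allowshort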

def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #number of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #number of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #Short/near-good heads in the distance [chars] before a good par => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = ' heading="yes"' if p['heading'] else ''
                prevert_paragraphs.append('<p%s>\n%s\n</p>' % (heading, p_text))
    return ('\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
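# A sketch of an html2prevert() call (made-up input; the real caller passes the
# whole rendered article as UTF-8 bytes, see process_page() below):
#   prevert, n_paragraphs, n_chars = html2prevert(page_html_bytes, justext_wordlist)
# The first element holds the kept paragraphs, each wrapped as '<p>\n...\n</p>'
# (with heading="yes" for headings); the other two are paragraph and character counts.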

def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, url_quote(title))

    try:
        response_data = urllib.request.urlopen(api_url, timeout=10).read()
    except HTTPError as e:
        raise PageNotFound()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise RequestTimeout()
        else:
            raise PageNotFound()
    except socket.timeout as e:
        raise RequestTimeout()
    parse_time = datetime.datetime.now()
    try:
        response_data = response_data.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        logf.write('\tignoring a UnicodeDecodeError\n')
        response_data = response_data.decode('utf-8', errors='ignore')
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise InvalidResponse()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    else:
        raise EmptyHTML()
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        linksf.write('%s\n' % '\n'.join(p['externallinks']))
    logf.write('\t%d chars\n' % plaintext_len)
    page_attrs = 'url="%s" title="%s" wiki_categories="%s" wiki_translations="%d" ' \
            'paragraphs="%d" chars="%d" crawl_date="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories,
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert)
    return (page, paragraph_count, revid, parse_time)
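# Each processed article ends up on stdout as one <doc> element; the header
# built above looks roughly like this (attribute values are illustrative):
#   <doc url="https://en.wikipedia.org/wiki/Example" title="Example"
#    wiki_categories="Cat1|Cat2" wiki_translations="12" paragraphs="8"
#    chars="3456" crawl_date="2020-01-01 12:00">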

def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    #check the cache first
    if title in cache:
        #download the newest revision if there is an old version in the cache
        if newest:
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            last_api_request = datetime.datetime.now()
            api_url = API_JSON % (langcode, url_quote(title))
            resp = urllib.request.urlopen(api_url)
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    logf.write('\tusing old revision %s instead of the newest '
                        'revision (invalid Wiki API response data)\n' % previous_revid)
                    current_revid = previous_revid
                #skip if cached is already newest
                if current_revid == previous_revid:
                    hits_by_type['skipped'] += 1
                    logf.write('\tskipping cached\n')
                    return (page, parlen, revid, last_api_parse)
                #continue to download the page otherwise
        #skip because in cache
        else:
            hits_by_type['skipped'] += 1
            logf.write('\tskipping already downloaded\n')
            return (page, parlen, revid, last_api_parse)
    #download the page since it is not in the cache or there is a new version
    try:
        page, parlen, revid, last_api_parse =\
                process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        hits_by_type['processed'] += 1
        logf.write('\t%d paragraphs\n' % parlen)
    except Exception as e: #PageNotFound, RequestTimeout, InvalidResponse, EmptyHTML, EmptyJusText
        page = ''
        hits_by_type['empty'] += 1
        log_msg = {
            'PageNotFound': '\tskipped -- page not found',
            'RequestTimeout': '\tskipped -- request timeout',
            'InvalidResponse': '\tskipped -- invalid response',
            'EmptyHTML': '\tempty HTML parse returned by API',
            'EmptyJusText': '\tempty prevert returned by jusText'
        }.get(type(e).__name__, '\tskipped or empty -- %s' % type(e).__name__)
        logf.write('%s\n' % log_msg)
    #update the cache (previous records for the same title are replaced when reloading the cache)
    if newest:
        cache[title] = revid #zero if page not found/empty/exception
    else:
        cache.add(title)
    cf.write('%s\t%s\n' % (title, revid))
    return (page, parlen, revid, last_api_parse)
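# Cache file format: one tab-separated "title<TAB>revid" record per line, as
# written by go_page() above and re-read in main() below; revid is 0 for pages
# that failed or came back empty. Example record (values made up):
#   Example_article	987654321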

def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort,
        title_file_path=''):
    last_api_request = datetime.datetime.now()
    last_api_parse   = datetime.datetime.now()

    if len(justext_wordlist) == 0:
        logf.write('Wordlist file is empty, switching off stopwords detection.\n')
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'] = 0
        JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'] = 0

    linksf = open(links, 'a') if links else None
    cache = {} if newest else set()
    if os.path.exists(cachefn):
        logf.write('Cache: %s\n' % cachefn)
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    if newest:
                        cache[title.strip()] = revid.strip()
                    else:
                        cache.add(title.strip())
                except ValueError:
                    continue
        logf.write('Cache: %d titles loaded\n' % len(cache))

    langcode2 = langcode.replace('-', '_')
    wikidump_titles_url = LATEST % (langcode2, langcode2)
    if not title_file_path:
        title_file_path = wikidump_titles_url.rsplit('/', 1)[-1].replace('.gz', '')
    if not os.path.exists(title_file_path):
        logf.write('Getting all titles from latest Wikipedia dump %s to %s\n' %
            (wikidump_titles_url, title_file_path))
        wiki_title_data = urllib.request.urlopen(wikidump_titles_url).read()
        if wikidump_titles_url.endswith('.gz'):
            from io import BytesIO
            from gzip import GzipFile
            bio = BytesIO(wiki_title_data)
            bio.seek(0)
            wiki_title_data = GzipFile(fileobj=bio).read()
        with open(title_file_path, 'wb') as title_file:
            title_file.write(wiki_title_data)

    cf = open(cachefn, 'at') # cache file
    raw_response_fp = open(raw_response_path, 'at') #raw API responses
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    for line in open(title_file_path, 'rt', encoding='utf-8', errors='ignore'):
        title = line.strip().replace('"', "'")
        # TODO: filter titles, use RE as parameter
        logf.write('%s\n' % title)
        if not title or (nicetitles and unicode_char_category(title[0])[0] not in ('L', 'N')):
            logf.write('\tskipping (not a nice title)\n')
            continue
        page_titles = (title, 'Talk:' + title) if talkpages else (title,)
        for page_title in page_titles:
            page, parlen, revid, last_api_parse =\
                    go_page(langcode, page_title, linksf,
                    raw_response_fp, newest, justext_wordlist, logf,
                    last_api_request, last_api_parse, wait_interval,
                    justext_level, allowshort, cache, cf, hits_by_type)
            if page:
                sys.stdout.write(page)
    logf.write('Updated cache database stored in %s\n' % cachefn)
    if linksf:
        linksf.close()
    cf.close()
    for hit_type, hit_count in hits_by_type.items():
        logf.write('%s: %d\n' % (hit_type.title(), hit_count))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical or numerical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    parser.add_argument('--title-file', help='Path to a custom list of titles to download, one per line.', type=str, default='')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.rstrip() for line in fp])
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort, args.title_file)
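
# Example invocation (a sketch; 'czech_wordlist.txt' is a hypothetical frequency
# list with one word per line; the prevertical goes to stdout and progress is
# logged to <langcode>_<timestamp>.log):
#   python3 wiki2corpus-2.0.py cs czech_wordlist.txt --newest --cleaning balanced > cswiki.prevert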