Downloads: wiki2corpus-1.2.0.py

File wiki2corpus-1.2.0.py, 14.1 KB (added by admin, 5 months ago)
#!/usr/bin/python
#coding=utf-8

"""
MIT License

Copyright (c) 2017 Vit Baisa, Vit Suchomel, Marek Blahus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

"""
MediaWiki API help:
https://www.mediawiki.org/w/api.php?action=help&modules=query
"""

VERSION = '1.2.0'

import re
import os
import sys
import argparse
# TODO: replace urllib2/urllib/httplib with requests
import urllib2
import urllib
import httplib # httplib.HTTPException
import datetime
import json
import time
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
from unicodedata import category as unicode_char_category
remove_links_re = re.compile(u'</?a[^>]*>')

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx
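
# Only a few fields of the action=parse JSON response are consumed below
# (see process_page). A trimmed, illustrative sketch of its shape -- real
# responses carry many more fields and the values here are made up:
#   {"parse": {
#       "title": "Prague",
#       "revid": 123456,
#       "text": {"*": "<div>...rendered article HTML...</div>"},
#       "langlinks": [{"lang": "cs", "*": "Praha"}],
#       "categories": [{"sortkey": "", "*": "Capitals_in_Europe"}],
#       "externallinks": ["http://example.com/"]
#   }}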

JUSTEXT_PARAMS_BY_LEVEL = {
    'verystrict': { #Justext default
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.3,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'strict': { #recommended
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.25,
        'stopwords_high': 0.32,
        'max_link_density': 0.3,
        'max_good_distance': 5,
        'max_heading_distance': 150,
    },
    'balanced': {
        'length_low': 55,
        'length_high': 140,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.4,
        'max_good_distance': 5,
        'max_heading_distance': 200,
    },
    'permissive': {
        'length_low': 40,
        'length_high': 90,
        'stopwords_low': 0.2,
        'stopwords_high': 0.3,
        'max_link_density': 0.45,
        'max_good_distance': 10,
        'max_heading_distance': 300,
    },
}
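
# A rough worked example of the thresholds above (values hypothetical): under
# 'strict', a paragraph of 150 characters with a stopword density of 0.28 sits
# between length_low=70 and length_high=200 and between stopwords_low=0.25 and
# stopwords_high=0.32, so jusText rates it 'neargood' and html2prevert below
# keeps it ('good' and 'neargood' paragraphs are extracted).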

def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    j_length_low = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_low']
    j_length_high = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['length_high']
    j_max_heading_distance = JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_heading_distance']
    if allowshort:
        #lower the length thresholds to a third, e.g. for ideographic scripts
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low, #character count < length_low => bad or short
        length_high=j_length_high, #character count > length_high => good
        stopwords_low=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_low'], #ratio of words frequent in the language >= stopwords_low => neargood
        stopwords_high=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['stopwords_high'], #ratio of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=JUSTEXT_PARAMS_BY_LEVEL[justext_level]['max_link_density'] #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=j_max_heading_distance #short/neargood headings within this distance [chars] before a good paragraph => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = u' heading="1"' if p['heading'] else u''
                prevert_paragraphs.append(u'<p%s>\n%s\n</p>' % (heading, p_text))
    return (u'\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
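
# Illustrative call (hypothetical file and variable names; see main() below
# for the real wiring):
#   justext_wordlist = set(l.decode('utf-8').rstrip() for l in open('english_2000.txt'))
#   prevert, par_count, text_len = html2prevert(html_bytes, justext_wordlist)
# The returned prevert string holds the kept paragraphs in the vertical format:
#   <p heading="1">
#   History
#   </p>
#   <p>
#   The city was founded ...
#   </p>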

def api_wait(last, wait_interval):
    #sleep long enough to keep at least wait_interval seconds between API requests
    n = datetime.datetime.now()
    interval = (n - last).total_seconds()
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

def process_page(langcode, title, linksf, raw_response_fp, justext_wordlist, logf,
        last_api_parse, wait_interval, justext_level, allowshort):
    api_wait(last_api_parse, wait_interval)
    #percent-encode the title so special characters cannot break the query string
    api_url = API_HTML % (langcode, urllib.quote(title))
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except (IOError, httplib.HTTPException):
        # IOError includes both urllib2.URLError and socket.error (Python >= 2.6 for the latter)
        raise MissingPage()
    data = json.loads(response_data)
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if not html:
        raise EmptyHTML()
    #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
    html = remove_links_re.sub('', html)
    prevert, paragraph_count, plaintext_len = html2prevert(
        html.encode('utf-8'), justext_wordlist, justext_level, allowshort) # justext decodes!
    if not prevert:
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    page_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    page = '<doc %s>\n%s\n</doc>\n' % (page_attrs, prevert.encode('utf-8'))
    return (page, paragraph_count, revid, parse_time)
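
# The <doc> header written for each page looks like this (values hypothetical;
# wrapped here for readability, the script writes it on one line):
#   <doc url="https://en.wikipedia.org/wiki/Prague" title="Prague"
#       categories="Capitals_in_Europe|..." translations="142" paragraphs="38"
#       chars="21500" downloaded="2017-05-01 12:00">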

def go_page(langcode, title, linksf, raw_response_fp, newest, justext_wordlist, logf, last_api_request,
        last_api_parse, wait_interval, justext_level, allowshort, cache, cf, hits_by_type):
    page, parlen, revid = '', 0, 0
    if title in cache: # if page has been found in cache
        if newest: # prepare to download the newest revision
            previous_revid = cache[title]
            api_wait(last_api_request, wait_interval)
            # TODO: propagate the updated last_api_request back to the caller
            last_api_request = datetime.datetime.now()
            resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
            data = json.load(resp)
            dqp = data['query']['pages']
            for key in dqp.keys():
                try:
                    current_revid = dqp[key]['revisions'][0]['revid']
                except (KeyError, IndexError):
                    print >>logf, '\tusing old revision %s instead of the newest ' \
                        'revision (invalid Wiki API response data)' % previous_revid
                    current_revid = previous_revid
                #compare as strings: revids read back from the cache file are text,
                #while revids taken from the API JSON are numbers
                if str(current_revid) == str(previous_revid): # skip if cached is already newest
                    hits_by_type['skipped'] += 1
                    print >>logf, '\tskipping cached'
                    return (page, parlen, revid, last_api_parse)
        else: # skip because in cache
            hits_by_type['skipped'] += 1
            print >>logf, '\tskipping already downloaded'
            return (page, parlen, revid, last_api_parse)
    # download the page
    try:
        page, parlen, revid, last_api_parse =\
                process_page(langcode, title, linksf,
                raw_response_fp, justext_wordlist, logf,
                last_api_parse, wait_interval,
                justext_level, allowshort)
        cache[title] = revid
        cf.write('%s\t%s\n' % (title, revid))
        hits_by_type['processed'] += 1
        print >>logf, '\t%d paragraphs' % parlen
    except (MissingPage, EmptyHTML, EmptyJusText) as e:
        page = ''
        hits_by_type['empty'] += 1
        print >>logf, {
                'MissingPage': '\tempty because not found',
                'EmptyHTML': '\tempty HTML parse returned by API',
                'EmptyJusText': '\tempty prevert returned by jusText'} \
                [type(e).__name__]
    return (page, parlen, revid, last_api_parse)

def main(langcode, cachefn, raw_response_path, newest, links, justext_wordlist,
        logf, wait_interval, nicetitles, talkpages, justext_level, allowshort):
    last_api_request = datetime.datetime.now()
    last_api_parse   = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
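    # The cache file keeps one "title<TAB>revid" record per line; it is loaded
    # above at start-up and appended to by go_page() as pages are processed, so
    # an interrupted run can resume without re-downloading finished pages.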
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    wikidump_titles_path = LATEST % (langcode.replace('-', '_'), langcode.replace('-', '_'))
    print >>logf, 'Getting all titles from latest Wikipedia dump %s' % wikidump_titles_path
    hits_by_type = {'processed': 0, 'skipped': 0, 'empty': 0}
    filename, _ = urllib.urlretrieve(wikidump_titles_path)
    with gzip.open(filename) as df:
        for line in df:
            title = line.strip().replace('"', "'")
            if not title: #skip blank lines in the dump
                continue
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            if nicetitles and unicode_char_category(unicode(title, 'utf-8')[0])[0] != 'L':
                print >>logf, '\tskipping (not a nice title)', title
                continue
            for page_title in filter(None, [title, 'Talk:' + title if talkpages else None]):
                page, parlen, revid, last_api_parse =\
                        go_page(langcode, page_title, linksf,
                        raw_response_fp, newest, justext_wordlist, logf,
                        last_api_request, last_api_parse, wait_interval,
                        justext_level, allowshort, cache, cf, hits_by_type)
                if page:
                    sys.stdout.write(page)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    raw_response_fp.close()
    for hit_type, hit_count in hits_by_type.items():
        print >>logf, '%s: %d' % (hit_type.title(), hit_count)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='File with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests [s]', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia into this file', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical character', action='store_true')
    parser.add_argument('--talkpages', help='Download talk pages as well', action='store_true')
    parser.add_argument('--cleaning', help='Level of Justext boilerplate & short paragraph removal strictness (default = strict)',
        type=str, choices=('verystrict', 'strict', 'balanced', 'permissive'), default='strict')
    parser.add_argument('--allowshort', help='Allow three times shorter texts. Useful for ideographic scripts.', action='store_true')
    args = parser.parse_args()
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    logfn = args.langcode.replace('/','') + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    with open(logfn, 'w', buffering=1) as logfile:
        main(args.langcode, cachefile, raw_response_file, args.newest, args.links,
            justext_wordlist, logfile, args.wait, args.nicetitles, args.talkpages,
            args.cleaning, args.allowshort)
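
# Example invocation (a sketch; the wordlist file name is hypothetical):
#   python wiki2corpus-1.2.0.py en english_top2000.txt --cleaning strict --wait 1.5 > enwiki.prevert
# The prevert corpus is written to stdout; progress goes to a timestamped log
# file such as en_2017-05-01_12-00-00.log in the working directory.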