Downloads: wiki2corpus-1.0.1.py

File wiki2corpus-1.0.1.py, 11.2 KB (added by admin, 9 months ago)
#!/usr/bin/python
#coding=utf-8

"""
MIT License

Copyright (c) 2016 Vit Baisa, Vit Suchomel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

VERSION = '1.0.1'

import re
import os
import sys
import argparse
import urllib2
import urllib
import socket #socket.error
import datetime
import json
import time
import codecs
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
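# External dependencies: the justext package (its older "core" module API is
# used directly) and lxml; both must be importable for the script to run.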
remove_links_re = re.compile(u'</?a[^>]*>')

from unicodedata import category as unicode_char_category

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx

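# Run jusText over the article HTML, keep paragraphs classified as good or
# neargood and wrap them in <p> elements; returns a tuple of
# (prevert text, paragraph count, plaintext length).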
def html2prevert(s, justext_wordlist):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=70, #character count < length_low => bad or short
        length_high=200, #character count > length_high => good
        stopwords_low=0.2, #number of words frequent in the language >= stopwords_low => neargood
        stopwords_high=0.3, #number of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=0.4 #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=200, #Short/near-good heads in the distance [chars] before a good par => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = u' heading="1"' if p['heading'] else u''
                prevert_paragraphs.append(u'<p%s>\n%s\n</p>' % (heading, p_text))
    return (u'\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)

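# Sleep so that at least wait_interval seconds pass between consecutive API
# requests; last is the datetime of the previous request.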
def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)

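# Download a single article via the parse API, append the raw JSON response to
# raw_response_fp, convert the returned HTML to prevert and return a tuple of
# (<doc> string, paragraph count, revision id, time of the API call).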
def process_article(langcode, title, linksf, raw_response_fp, justext_wordlist, logf, last_api_parse, wait_interval):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, title)
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except urllib2.HTTPError:
        raise MissingPage()
    try:
        data = json.loads(response_data)
    except (socket.error, ValueError):
        raise MissingPage()
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist) # justext decodes!
    else:
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    if not prevert:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    article_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    article = '<doc %s>\n%s\n</doc>\n' % (article_attrs, prevert.encode('utf-8'))
    return (article, paragraph_count, revid, parse_time)

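# Crawl all titles from the latest Wikipedia dump for the given language code:
# titles already in the cache are skipped (or re-downloaded with --newest when
# their revision changed), <doc> records are written to stdout and progress
# messages to the log file.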
def main(langcode, cachefn, raw_response_path, logfn, newest, links, justext_wordlist, logf, wait_interval, nicetitles):
    last_api_request = datetime.datetime.now()
    last_api_parse   = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            # TODO: download also talk pages
            title = line.strip().replace('"', "'")
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            if nicetitles and not unicode_char_category(unicode(title, 'utf-8')[0])[0] == 'L':
                print >>logf, '\tskipping (not a nice title)'
                continue
            article = ''
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request, wait_interval)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, title))
                    data = json.load(resp)
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            current_revid = str(dqp[key]['revisions'][0]['revid'])
                            if previous_revid != current_revid:
                                article, parlen, revid, last_api_parse =\
                                        process_article(langcode, title, linksf,
                                        raw_response_fp, justext_wordlist, logf,
                                        last_api_parse, wait_interval)
                                cache[title] = revid
                                cf.write('%s\t%s\n' % (title, revid))
                            else:
                                print >>logf, '\tskipping cached'
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        article = ''
                        empty_articles += 1
                else:
                    # do not download
                    print >>logf, '\tskip already downloaded'
                    skipped_articles += 1
                    article = ''
            else:
                try:
                    article, parlen, revid, last_api_parse =\
                            process_article(langcode, title, linksf,
                            raw_response_fp, justext_wordlist, logf,
                            last_api_parse, wait_interval)
                    cache[title] = revid
                    cf.write('%s\t%s\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    article = ''
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()

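# Command line interface: the prevert corpus is written to standard output,
# progress messages go to a timestamped .log file in the current directory.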
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='Cache file with titles and revision IDs of previously downloaded pages', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical character', action='store_true')
    args = parser.parse_args()
    logfn = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    logfile = open(logfn, 'w')
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    main(args.langcode, cachefile, raw_response_file, logfile, args.newest, args.links, justext_wordlist, logfile, args.wait, args.nicetitles)
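# Example invocation (file names are illustrative only; the word list should
# contain roughly the 2000 most frequent words of the target language, one per
# line, UTF-8 encoded):
#   python wiki2corpus-1.0.1.py cs cs_top2000_words.txt --wait 2.0 > cswiki.prevert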