Downloads: wiki2corpus-1.0.1.py

File wiki2corpus-1.0.1.py, 11.2 KB (added by admin, 7 years ago)
#!/usr/bin/python
#coding=utf-8

"""
MIT License

Copyright (c) 2016 Vit Baisa, Vit Suchomel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

VERSION = '1.0.1'

import re
import os
import sys
import argparse
import urllib2
import urllib
import socket #socket.error
import datetime
import json
import time
import codecs
import gzip
from justext import core as justext
from lxml.etree import XMLSyntaxError, ParserError
remove_links_re = re.compile(u'</?a[^>]*>')

from unicodedata import category as unicode_char_category

class MissingPage(Exception):
    pass

class EmptyHTML(Exception):
    pass

class EmptyJusText(Exception):
    pass

LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
WIKI_URL = 'https://%s.wikipedia.org/wiki/'
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# TODO: look at https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&titles=xxx
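
# Illustrative example (the title 'Prague' is hypothetical): with langcode='en',
# API_HTML expands to
#   https://en.wikipedia.org/w/api.php?action=parse&page=Prague&format=json
# and API_JSON to
#   https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Prague&format=json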

def html2prevert(s, justext_wordlist):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    #use Justext to classify paragraphs
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=70, #character count < length_low => bad or short
        length_high=200, #character count > length_high => good
        stopwords_low=0.2, #number of words frequent in the language >= stopwords_low => neargood
        stopwords_high=0.3, #number of words frequent in the language >= stopwords_high => good or neargood
        max_link_density=0.4 #density of link words (words inside the <a> tag) > max_link_density => bad
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        max_heading_distance=200, #Short/near-good heads in the distance [chars] before a good par => good
    )
    #extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        #if p['class'] == 'good': # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'): #'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = u' heading="1"' if p['heading'] else u''
                prevert_paragraphs.append(u'<p%s>\n%s\n</p>' % (heading, p_text))
    return (u'\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
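
# Usage sketch (hypothetical variable names): process_article() below calls this as
#   prevert, n_paragraphs, n_chars = html2prevert(article_html_utf8, justext_wordlist)
# where prevert holds the kept paragraphs wrapped in <p>...</p> elements,
# n_paragraphs their count and n_chars the total plaintext length.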

def api_wait(last, wait_interval):
    n = datetime.datetime.now()
    interval = (n-last).seconds + ((n-last).microseconds / 1.0e6)
    if interval < wait_interval:
        time.sleep(wait_interval - interval)
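
# api_wait() throttles API access: it sleeps just long enough so that at least
# wait_interval seconds pass between two consecutive requests.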

def process_article(langcode, title, linksf, raw_response_fp, justext_wordlist, logf, last_api_parse, wait_interval):
    api_wait(last_api_parse, wait_interval)
    api_url = API_HTML % (langcode, title)
    try:
        response_data = urllib2.urlopen(api_url).read()
        parse_time = datetime.datetime.now()
    except (urllib2.HTTPError, socket.error):
        raise MissingPage()
    try:
        data = json.loads(response_data)
    except ValueError:
        raise MissingPage()
    if not data or 'error' in data:
        raise MissingPage()
    #store the API response to allow re-processing without downloading in the future
    raw_response_fp.write('%s\t%d\n' % (api_url, len(response_data) + 1))
    raw_response_fp.write(response_data)
    raw_response_fp.write('\n')
    #parse the API response
    p = data['parse']
    html = p['text']['*'].strip()
    if html:
        #remove <a/> tags (Justext makes extra spaces there) # TODO: correct Justext and remove
        html = remove_links_re.sub('', html)
        prevert, paragraph_count, plaintext_len = html2prevert(
            html.encode('utf-8'), justext_wordlist) # justext decodes!
    else:
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    if not prevert:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '|'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line.encode('utf-8') + '\n')
        linksf.write('\n')
    print >>logf, '\t%d chars' % plaintext_len
    article_attrs = 'url="%s" title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d" downloaded="%s"' % \
            ((WIKI_URL % langcode) + title, title, categories.encode('utf-8'),
            langlinks_len, paragraph_count, plaintext_len,
            parse_time.strftime('%Y-%m-%d %H:%M'))
    article = '<doc %s>\n%s\n</doc>\n' % (article_attrs, prevert.encode('utf-8'))
    return (article, paragraph_count, revid, parse_time)
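
# process_article() returns a complete <doc ...>...</doc> block in the prevert format
# together with its paragraph count, the revision id (cached by main() so unchanged
# articles can be skipped on later runs) and the download time.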

def main(langcode, cachefn, raw_response_path, logfn, newest, links, justext_wordlist, logf, wait_interval, nicetitles):
    last_api_request = datetime.datetime.now()
    last_api_parse = datetime.datetime.now()

    linksf = open(links, 'a') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'a') # cache file
    raw_response_fp = open(raw_response_path, 'a') #raw API responses
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            # TODO: download also talk pages
            title = line.strip().replace('"', "'")
            # TODO: filter titles, use RE as parameter
            print >>logf, '%s' % title
            if nicetitles and not unicode_char_category(unicode(title, 'utf-8')[0])[0] == 'L':
                print >>logf, '\tskipping (not a nice title)'
                continue
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request, wait_interval)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, title))
                    data = json.load(resp)
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            current_revid = dqp[key]['revisions'][0]['revid']
                            if previous_revid != str(current_revid):
                                article, parlen, revid, last_api_parse =\
                                        process_article(langcode, title, linksf,
                                        raw_response_fp, justext_wordlist, logf,
                                        last_api_parse, wait_interval)
                                cache[title] = revid
                                cf.write('%s\t%s\n' % (title, revid))
                            else:
                                print >>logf, '\tskipping cached'
                                article = ''
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        article = ''
                        empty_articles += 1
                else:
                    # do not download
                    print >>logf, '\tskip already downloaded'
                    skipped_articles += 1
                    article = ''
            else:
                try:
                    article, parlen, revid, last_api_parse =\
                            process_article(langcode, title, linksf,
                            raw_response_fp, justext_wordlist, logf,
                            last_api_parse, wait_interval)
                    cache[title] = revid
                    cf.write('%s\t%d\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    article = ''
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', help='Wikipedia language prefix', type=str)
    parser.add_argument('wordlist', help='Path to a list of ~2000 most frequent words in the language (UTF-8, one per line)', type=str)
    parser.add_argument('--cache', help='Path to the cache file with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests, in seconds', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    parser.add_argument('--nicetitles', help='Download only titles starting with an alphabetical character', action='store_true')
    args = parser.parse_args()
    logfn = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.log'
    logfile = open(logfn, 'w')
    cachefile = args.cache or args.langcode + 'wiki.cache'
    raw_response_file = (args.cache or args.langcode) + '_raw_data'
    with open(args.wordlist) as fp:
        justext_wordlist = set([line.decode('utf-8').rstrip() for line in fp])
    main(args.langcode, cachefile, raw_response_file, logfile, args.newest, args.links, justext_wordlist, logfile, args.wait, args.nicetitles)
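
# Example invocation (a sketch; the language code and word list file name are placeholders):
#   python wiki2corpus-1.0.1.py cs czech_top2000_words.txt --wait 1.5 --nicetitles > cswiki.prevert
# The corpus is written to standard output, progress messages go to a timestamped
# .log file and the title/revision cache to cswiki.cache (or the file given by --cache).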