#!/usr/bin/python

"""
MIT License

Copyright (c) 2016 Vit Baisa

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
|
---|
26 |
|
---|
27 | import os
|
---|
28 | import sys
|
---|
29 | import argparse
|
---|
30 | import urllib2
|
---|
31 | import urllib
|
---|
32 | import datetime
|
---|
33 | import json
|
---|
34 | import time
|
---|
35 | import datetime
|
---|
36 | import gzip
|
---|
37 | from justext import core as justext
|
---|
38 | from lxml.etree import XMLSyntaxError, ParserError
|
---|
39 |
|
---|
class MissingPage(Exception):
    """Raised when the MediaWiki API answers with an 'error' object for a title."""
|
---|
42 |
|
---|
class EmptyHTML(Exception):
    """Raised when the API returns a blank HTML rendering of a page."""
|
---|
45 |
|
---|
class EmptyJusText(Exception):
    """Raised when jusText extracts no paragraphs from the page HTML."""
|
---|
48 |
|
---|
# Gzipped list of all article titles (namespace 0) in the latest dump;
# the language code is interpolated twice.
LATEST = 'https://dumps.wikimedia.org/%swiki/latest/%swiki-latest-all-titles-in-ns0.gz'
# MediaWiki API: parsed (HTML) rendering of a page, as JSON.
API_HTML = 'https://%s.wikipedia.org/w/api.php?action=parse&page=%s&format=json'
# MediaWiki API: revision metadata for a title, as JSON.
API_JSON = 'https://%s.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&format=json'

# Timestamps of the most recent API calls; api_wait() throttles against these.
last_api_request = datetime.datetime.now()
last_api_parse = datetime.datetime.now()
# Log file handle; opened in main().
logf = None
|
---|
56 |
|
---|
def html2prevert(s):
    """Turn raw HTML bytes into a list of jusText paragraph dicts.

    Returns an empty list when lxml cannot parse the markup.
    """
    try:
        return justext.make_paragraphs(
                justext.preprocess(html_text=s, encoding='utf-8'))
    except (ParserError, XMLSyntaxError):
        return []
|
---|
63 |
|
---|
def api_wait(last, wait=None):
    """Sleep so that at least `wait` seconds elapse since `last`.

    last -- datetime of the previous API request
    wait -- minimum interval in seconds; when None, falls back to the
            module-level `wait_interval` set from the --wait option
    """
    if wait is None:
        wait = wait_interval
    # total_seconds() replaces the manual seconds+microseconds arithmetic
    elapsed = (datetime.datetime.now() - last).total_seconds()
    if elapsed < wait:
        time.sleep(wait - elapsed)
|
---|
71 |
|
---|
def process_article(langcode, title, linksf):
    """Download one article via the API and convert it to a <doc> string.

    langcode -- Wikipedia language prefix (e.g. 'en')
    title    -- article title as read from the dump (bytes)
    linksf   -- open file to collect external links into, or None

    Returns (document_string, paragraph_count, revid).
    Raises MissingPage, EmptyHTML or EmptyJusText when no usable
    text could be extracted.
    """
    global last_api_parse
    global logf
    api_wait(last_api_parse)
    last_api_parse = datetime.datetime.now()
    # quote the title: raw titles may contain '&', '?', '%', ... which
    # would otherwise corrupt the query string
    resp = urllib2.urlopen(API_HTML % (langcode, urllib.quote(title)))
    data = json.load(resp)
    if 'error' in data:
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if not html.strip():
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()
    pars = html2prevert(html.encode('utf-8')) # justext decodes!
    if not pars:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    outp = [justext.html_escape(par['text']) for par in pars]
    revid = p['revid']
    langlinks_len = len(p['langlinks'])
    categories = '\v'.join([d['*'].replace('"', '') for d in p['categories']])
    if linksf and p['externallinks']:
        linksf.write('### %s\n' % title)
        for line in p['externallinks']:
            linksf.write(line + '\n')
        linksf.write('\n')
    # join instead of repeated += (and do not shadow the parse dict `p`
    # with the loop variable, as the original did)
    body = ''.join('<p>\n%s\n</p>\n' % par for par in outp)
    chars = sum(len(par) for par in outp)
    print >>logf, '\t%d chars' % chars
    header = '<doc title="%s" categories="%s" translations="%d" paragraphs="%d" chars="%d">\n' %\
            (title, categories.encode('utf-8'), langlinks_len,
            len(outp), chars)
    return header + body.encode('utf-8') + '</doc>\n', len(outp), revid
|
---|
117 |
|
---|
def main(langcode, cachefn, logfn, newest, links):
    """Download all articles of one Wikipedia language edition.

    langcode -- language prefix ('en', 'cs', ...)
    cachefn  -- cache file with title<TAB>revid lines from previous runs
    logfn    -- path of the log file to create
    newest   -- if True, re-download cached articles whose revision changed
    links    -- path for gathered external links ('' disables gathering)

    Writes the prevertical to stdout and rewrites the cache file.
    """
    global logf
    # without this declaration, the assignment below made last_api_request
    # local and api_wait(last_api_request) raised UnboundLocalError
    global last_api_request
    logf = open(logfn, 'w')
    print >>sys.stderr, "Log will be stored in %s" % logfn
    linksf = open(links, 'w') if links else None
    cache = {}
    if os.path.exists(cachefn):
        print >>logf, 'Cache: %s' % cachefn
        with open(cachefn) as cf:
            for line in cf:
                try:
                    title, revid = line.split('\t')
                    cache[title.strip()] = revid.strip()
                except ValueError:
                    continue
    cf = open(cachefn, 'w') # empty cache file; re-filled below
    print >>logf, 'Getting all titles from latest Wikipedia dump'
    processed_articles = 0
    skipped_articles = 0
    empty_articles = 0
    filename, _ = urllib.urlretrieve(LATEST % (langcode, langcode))
    with gzip.open(filename) as df:
        for line in df:
            # reset each round: a stale document from the previous
            # iteration must not be re-emitted on the skip paths
            article = ''
            title = line.strip().replace('"', "'")
            print >>logf, '%s' % title
            if title in cache:
                if newest: # download the newest revision
                    previous_revid = cache[title]
                    api_wait(last_api_request)
                    last_api_request = datetime.datetime.now()
                    resp = urllib2.urlopen(API_JSON % (langcode, urllib.quote(title)))
                    data = json.load(resp)  # was: json.load(response) -> NameError
                    try:
                        dqp = data['query']['pages']
                        for key in dqp.keys():
                            # 'revisions' is a list of revision dicts; compare
                            # as strings, cache values are read back as strings
                            current_revid = str(dqp[key]['revisions'][0]['revid'])
                            if previous_revid != current_revid:
                                article, parlen, revid = process_article(langcode, title, linksf)
                                cache[title] = revid
                                cf.write('%s\t%s\n' % (title, revid))
                                processed_articles += 1
                            else:
                                print >>logf, '\tskipping cached'
                                cf.write('%s\t%s\n' % (title, previous_revid))
                                skipped_articles += 1
                    except (MissingPage, EmptyHTML, EmptyJusText):
                        article = ''
                        empty_articles += 1
                else:
                    # do not download, but keep the entry in the rewritten
                    # cache file (it was truncated above)
                    print >>logf, '\tskip already downloaded'
                    cf.write('%s\t%s\n' % (title, cache[title]))
                    skipped_articles += 1
            else:
                try:
                    article, parlen, revid = process_article(langcode, title, linksf)
                    cache[title] = revid
                    cf.write('%s\t%s\n' % (title, revid))
                    print >>logf, '\t%d paragraphs' % parlen
                    processed_articles += 1
                except (MissingPage, EmptyHTML, EmptyJusText):
                    empty_articles += 1
            if article:
                sys.stdout.write(article)
    print >>logf, 'Updated cache database stored in %s' % cachefn
    if linksf:  # None when --links was not given
        linksf.close()
    cf.close()
    print >>logf, 'Processed: %d' % processed_articles
    print >>logf, 'Empty: %d' % empty_articles
    print >>logf, 'Skipped: %d' % skipped_articles
    logf.close()
|
---|
187 |
|
---|
if __name__ == '__main__':
    # Command line interface: positional language code plus optional flags.
    parser = argparse.ArgumentParser(description='Wikipedia downloader')
    parser.add_argument('langcode', type=str, help='Wikipedia language prefix')
    parser.add_argument('--cache', help='Directory with previously downloaded pages and data', type=str, default='')
    parser.add_argument('--wait', help='Time interval between GET requests', type=float, default=1.0)
    parser.add_argument('--newest', help='Download the newest versions of articles', action='store_true')
    parser.add_argument('--links', help='Gather external links from Wikipedia', type=str, default='')
    args = parser.parse_args()
    # module-level throttle interval read by api_wait()
    wait_interval = args.wait

    # one log file per run, named after the start timestamp
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = current_time + '.log'
    # default cache file name, e.g. 'enwiki.cache'
    cachefile = args.cache or args.langcode + 'wiki.cache'
    main(args.langcode, cachefile, logfile, args.newest, args.links)
|
---|