diff --git a/Makefile.config b/Makefile.config
index 0d4c90d..be9c42c 100644
a
|
b
|
|
1 | | #JUDY_INC=-I/opt/local/include |
2 | | #JUDY_LIB=-L/opt/local/lib |
3 | | |
4 | 1 | PREFIX=/usr/local |
5 | 2 | INSTALL_BIN=$(PREFIX)/bin |
6 | | INSTALL_DATA=$(PREFIX)/share |
7 | | No newline at end of file |
| 3 | INSTALL_DATA=$(PREFIX)/share |
diff --git a/src/Makefile b/src/Makefile
index 2426051..72f5009 100644
a
|
b
|
|
1 | 1 | include ../Makefile.config |
2 | 2 | |
3 | | CC=gcc |
4 | | CFLAGS=-Wall -O3 $(JUDY_INC) |
5 | | LIBS=$(JUDY_LIB) -lJudy |
| 3 | CC=g++ |
| 4 | CFLAGS=-Wall -O3 |
6 | 5 | |
7 | 6 | OBJS=version.o buzhash.o |
8 | 7 | TARGETS=hashgen hashdup onion |
diff --git a/src/onion.c b/src/onion.c
index 08dca9a..1dc1327 100644
a
|
b
|
|
13 | 13 | #include <unistd.h> |
14 | 14 | #include <sys/time.h> |
15 | 15 | #include <sys/resource.h> |
16 | | #include <Judy.h> |
17 | 16 | #include "buzhash.h" |
18 | 17 | #include "version.h" |
19 | 18 | |
| 19 | #if defined __GNUC__ || defined __APPLE__ |
| 20 | #include <ext/hash_map> |
| 21 | namespace std { using namespace __gnu_cxx; } |
| 22 | #else |
| 23 | #include <hash_map> |
| 24 | #endif |
| 25 | using namespace std; |
| 26 | typedef hash_map<uint64_t,bool> ngrhash; |
| 27 | |
20 | 28 | #define BITMASK_HIGH63 0xfffffffffffffffeul |
21 | 29 | |
22 | 30 | #define NGRAM_SIZE 7 |
… |
… |
int main(int argc, char **argv) {
|
216 | 224 | buzhash_buffer_t bh_buffer; |
217 | 225 | buzhash_init_buffer(&bh_buffer, Ngram_size); |
218 | 226 | |
219 | | // judy - for global duplicates |
220 | | int judy_rc; |
221 | | Pvoid_t judy = (Pvoid_t) NULL; |
222 | | |
223 | | // ljudy - for local (document level) duplicates |
224 | | Pvoid_t ljudy = (Pvoid_t) NULL; |
| 227 | ngrhash global, local; |
225 | 228 | |
226 | 229 | // read hashes of duplicate n-grams if available |
227 | 230 | int have_dupl_ngrams = 0; |
… |
… |
int main(int argc, char **argv) {
|
244 | 247 | hash_t masked_hash = hash & hash_bitmask; |
245 | 248 | // store only the 63 most significant bits of the hash; |
246 | 249 | // reserve the last bit as a flag (seen / unseen) |
247 | | J1S(judy_rc, judy, masked_hash & BITMASK_HIGH63); |
| 250 | global[masked_hash & BITMASK_HIGH63] = true; |
248 | 251 | |
249 | 252 | // print progress information |
250 | 253 | if (!Quiet && bytes_read % (10000000 * sizeof(hash)) == 0) { |
… |
… |
int main(int argc, char **argv) {
|
331 | 334 | int doc_i; |
332 | 335 | for (doc_i=0; doc_i<doc_count-1; doc_i++) { |
333 | 336 | buzhash_clear_buffer(&bh_buffer); |
334 | | J1FA(judy_rc, ljudy); |
| 337 | local.clear(); |
335 | 338 | // for all paragraphs in the document |
336 | 339 | int par_i; |
337 | 340 | for (par_i=docs[doc_i]; par_i<docs[doc_i+1]; par_i++) { |
… |
… |
int main(int argc, char **argv) {
|
359 | 362 | hash_t masked_hash = hash & hash_bitmask; |
360 | 363 | if (!buzhash_is_full_buffer(&bh_buffer)) |
361 | 364 | continue; |
362 | | J1T(judy_rc, ljudy, hash); |
363 | | if (!judy_rc) { |
| 365 | ngrhash::const_iterator it = local.find (hash); |
| 366 | if (it == local.end()) { |
364 | 367 | if (have_dupl_ngrams) { |
365 | 368 | // test with the last bit set to 1 |
366 | 369 | // (check against already seen duplicate ngrams) |
367 | | J1T(judy_rc, judy, masked_hash | 1); |
| 370 | it = global.find (masked_hash | 1); |
368 | 371 | } |
369 | 372 | else { |
370 | | J1T(judy_rc, judy, masked_hash); |
| 373 | it = global.find (masked_hash); |
371 | 374 | } |
372 | 375 | } |
373 | | if (judy_rc) { |
| 376 | if (it != global.end()) { |
374 | 377 | bad_tokens+= Ngram_size - prev_bad_tokens; |
375 | 378 | prev_bad_tokens = Ngram_size; |
376 | 379 | } |
377 | | J1S(judy_rc, ljudy, hash); |
| 380 | local[hash] = true; |
378 | 381 | } |
379 | 382 | |
380 | 383 | // remember the length of the paragraph |
… |
… |
int main(int argc, char **argv) {
|
455 | 458 | // stored hash to 1 if we have seen the matching |
456 | 459 | // duplicate n-gram to indicate it has been seen. |
457 | 460 | // Unique n-grams are ignored. |
458 | | J1U(judy_rc, judy, masked_hash & BITMASK_HIGH63); |
459 | | if (judy_rc) |
460 | | J1S(judy_rc, judy, masked_hash | 1); |
| 461 | if (global.erase (masked_hash & BITMASK_HIGH63)) |
| 462 | global[masked_hash | 1] = true; |
461 | 463 | } |
462 | 464 | else { |
463 | 465 | // otherwise we have to store hashes of all n-grams |
464 | | J1S(judy_rc, judy, masked_hash); |
| 466 | global[masked_hash] = true; |
465 | 467 | } |
466 | 468 | } |
467 | 469 | } |
… |
… |
int main(int argc, char **argv) {
|
502 | 504 | |
503 | 505 | return 0; |
504 | 506 | } |
| 507 | |
| 508 | // vim: ts=4 sw=4 sta et sts=4 si cindent tw=80: |