# First rule in the file, so a bare `make` builds the combined evaluation
# summary (and, through its prerequisites, everything it depends on).
all: eval_summary


#== GET LANGUAGE IDENTIFICATION TOOLS ==
# Fetch wcwb_lang_filter 1.0 into a fresh directory named after the target:
# download the tarball, unpack it in place, move the contents of the archive's
# top-level directory up into $@, then remove the emptied directory and the
# tarball.
# The two paths to remove are listed explicitly instead of using {a,b} brace
# expansion, which is not POSIX and is passed through literally by shells such
# as dash (make runs recipes with /bin/sh by default).
src_langfilter:
	mkdir $@
	wget -O $@/wcwb_lang_filter_1.0.tar.gz http://corpus.tools/raw-attachment/wiki/Downloads/wcwb_lang_filter_1.0.tar.gz
	tar -C $@ -xzvf $@/wcwb_lang_filter_1.0.tar.gz
	mv -iv $@/wcwb_lang_filter_1.0/* $@/
	rm -r $@/wcwb_lang_filter_1.0 $@/wcwb_lang_filter_1.0.tar.gz

# Fetch the master branch of langid.py into a fresh directory named after the
# target: download the zip, unpack it in place, move the contents of the
# archive's top-level directory up into $@, then remove the emptied directory
# and the zip.
# The two paths to remove are listed explicitly instead of using {a,b} brace
# expansion, which is not POSIX and is passed through literally by shells such
# as dash (make runs recipes with /bin/sh by default).
src_langid:
	mkdir $@
	wget -O $@/master.zip https://github.com/saffsd/langid.py/archive/master.zip
	unzip -d $@/ $@/master.zip
	mv -iv $@/langid.py-master/* $@/
	rm -r $@/langid.py-master $@/master.zip

# Fetch the master branch of langdetect into a fresh directory named after the
# target: download the zip, unpack it in place, move the contents of the
# archive's top-level directory up into $@, then remove the emptied directory
# and the zip.
# The two paths to remove are listed explicitly instead of using {a,b} brace
# expansion, which is not POSIX and is passed through literally by shells such
# as dash (make runs recipes with /bin/sh by default).
src_langdetect:
	mkdir $@
	wget -O $@/master.zip https://github.com/Mimino666/langdetect/archive/master.zip
	unzip -d $@/ $@/master.zip
	mv -iv $@/langdetect-master/* $@/
	rm -r $@/langdetect-master $@/master.zip


#== RUN TOOLS ==
# Feed the second tab-separated column of the gold data to the langid wrapper
# and store its standard output as the target.
# src_langid is a directory, so it is an order-only prerequisite (after `|`):
# it must exist before the recipe runs, but a change in the directory's
# timestamp does not force this output to be rebuilt.
out/langid: gold_data.txt | src_langid
	cut -f2 $< | python3 langid_wrapper.py > $@

# Feed the second tab-separated column of the gold data to the langdetect
# wrapper and store its standard output as the target.
# src_langdetect is a directory, so it is an order-only prerequisite (after
# `|`): it must exist before the recipe runs, but a change in the directory's
# timestamp does not force this output to be rebuilt.
out/langdetect: gold_data.txt | src_langdetect
	cut -f2 $< | python3 langdetect_wrapper.py > $@

# Run the word-list based language filter on the gold data.
# Pipeline stages, in order:
#   1. cut    - keep the second tab-separated column of the gold data
#   2. sed    - wrap every input line in <doc src="web"><p> ... </p></doc> markup
#   3. unitok - tools/unitok_4 with the Czech/Slovak configuration
#               (presumably tokenises the text - confirm against the tool)
#   4. filter - lang_filter script with the Czech, Slovak and English word
#               lists from src_langfilter/wl; $@_rejected is passed as the
#               path prefix for rejected material
#   5. grep   - drop lines starting with <par_langs
#   6. vert2plain - tools/vert2plain.py -l 0 (presumably converts the
#               tokenised output back to plain text - confirm)
#   7. grep   - drop lines starting with </doc, <p or </p
#   8. perl   - replace each <doc ... lang="X" ...> line, including its
#               newline, with "X<TAB>", so the language is joined to the
#               line that follows it
# src_langfilter is a directory, so it is an order-only prerequisite (after
# `|`): it must exist, but its timestamp does not trigger a rebuild.
# The final rm has no -f: it fails if no $@_rejected* file was written.
out/langfilter: gold_data.txt | src_langfilter
	cut -f2 $< \
	| sed -r 's,.*,<doc src="web">\n<p>\n\0\n</p>\n</doc>,' \
	| python3 tools/unitok_4 tools/unitok_czech_slovak.py \
	| ./lang_filter_no_text_size_limit_allow_unknown.py \
		Czech src_langfilter/wl/Czech_cstenten17_mj2.frqwl.gz \
		Slovak src_langfilter/wl/Slovak_sktenten11_rft1.frqwl.gz \
		English src_langfilter/wl/English_ententen15_tt31.frqwl.gz \
		ALL $@_rejected NONE \
	| grep -v '^<par_langs' \
	| python3 tools/vert2plain.py -l 0 \
	| grep -v -P '^(?:</doc|</?p)' \
	| perl -pe 'if (/^<doc /) {s,.* lang="([^"]+)".*\n,$$1\t,}' \
	> $@
	rm $@_rejected*

#== EVALUATE ==
# Score one tool's output against the gold data.
# evaluate.py receives, in order: the gold data, the matching out/<tool> file
# and the path $@.errors; whatever it prints on standard output becomes the
# target. ($@.errors is presumably where per-item errors are written - confirm
# against evaluate.py.)
eval/%: gold_data.txt out/%
	./evaluate.py $< out/$* $@.errors > $@

# Concatenate the per-tool evaluation files into one report and show it.
# Each section is a "=== <file> ===" header, the file's contents and a blank
# line. The whole loop is redirected once with `>`, which truncates any
# previous report, so no separate rm is needed.
# The heading is printed with printf rather than `echo -e`: -e is not POSIX
# and shells such as dash print it literally.
eval_summary: eval/langid eval/langdetect eval/langfilter
	for f in $^; do echo "=== $$f ==="; cat $$f; echo; done > $@
	printf '\n\nEvaluation:\n'
	cat $@


# Remove everything this Makefile generates: the downloaded tool sources, the
# contents of out/ and eval/ (the directories themselves are kept), the
# eval_summary report and the Python bytecode cache under tools/.
clean:
	rm -rf src_langfilter src_langid src_langdetect out/* eval/* eval_summary tools/__pycache__

# Command-style targets: neither recipe creates a file named after its target,
# so make must always treat them as out of date.
.PHONY: clean all
