Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@
data
fasttext
result
build/
python/fasttext.egg-info/
python/fasttext_pybind.cpython-37m-x86_64-linux-gnu.so

5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
#

CXX = c++
CXXFLAGS = -pthread -std=c++0x -march=native
CXXFLAGS = -Wall -pthread -std=c++14 -march=native -ffast-math -Wsuggest-final-methods -Wsuggest-override -Wodr -flto -ftree-loop-linear -floop-strip-mine -floop-block

OBJS = args.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
INCLUDES = -I.

opt: CXXFLAGS += -O3 -funroll-loops
opt: CXXFLAGS += -O3 -funroll-loops -DNDEBUG
opt: fasttext

coverage: CXXFLAGS += -O0 -fno-inline -fprofile-arcs --coverage
Expand Down
19 changes: 18 additions & 1 deletion python/fastText/FastText.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,26 @@ def check(entry):
text = check(text)
predictions = self.f.predict(text, k, threshold, on_unicode_error)
probs, labels = zip(*predictions)

return labels, np.array(probs, copy=False)

def predict_all(self, text, on_unicode_error='strict'):
    """
    Return the scores produced by the binding for one line of text, or
    for each line in a list of lines.

    text: a single string, or a list of strings. No entry may contain a
        newline character; one is appended to every entry before it is
        handed to the binding.
    on_unicode_error: accepted so the signature mirrors predict(); it is
        not used here because only numeric scores (no label strings)
        are returned.

    Returns a 1-D numpy array for a single string, or a float numpy
    array with one row per line for a list. The order of the scores
    within a row is whatever the underlying binding returns.

    Raises ValueError if any entry contains a newline character.
    """
    def check(entry):
        # The binding consumes exactly one line per call, so embedded
        # newlines would silently truncate the input.
        if entry.find('\n') != -1:
            raise ValueError(
                "predict processes one line at a time (remove \'\\n\')"
            )
        entry += "\n"
        return entry

    # isinstance (rather than an exact type check) lets list subclasses
    # take the batch path instead of failing on entry.find().
    if isinstance(text, list):
        text = [check(entry) for entry in text]
        predictions = self.f.multilinePredictAll(text)
        return np.array(predictions, dtype=float)
    else:
        text = check(text)
        probs = self.f.predictAll(text)
        # np.asarray avoids a copy when it can and copies when it must;
        # np.array(..., copy=False) raises ValueError on NumPy >= 2.0
        # whenever a copy is unavoidable (e.g. for a Python list).
        return np.asarray(probs)

def get_input_matrix(self):
"""
Get a copy of the full input matrix of a Model. This only
Expand Down
142 changes: 100 additions & 42 deletions python/fastText/pybind/fasttext_pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <vector.h>
#include <cmath>
#include <iterator>
#include <numeric>
#include <sstream>
#include <stdexcept>

Expand All @@ -39,7 +40,7 @@ py::str castToPythonString(const std::string& s, const char* onUnicodeError) {

std::pair<std::vector<py::str>, std::vector<py::str>> getLineText(
fasttext::FastText& m,
const std::string text,
const std::string& text,
const char* onUnicodeError) {
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
std::stringstream ioss(text);
Expand All @@ -60,7 +61,7 @@ std::pair<std::vector<py::str>, std::vector<py::str>> getLineText(
if (token == fasttext::Dictionary::EOS)
break;
}
return std::pair<std::vector<py::str>, std::vector<py::str>>(words, labels);
return {std::move(words), std::move(labels)};
}

PYBIND11_MODULE(fasttext_pybind, m) {
Expand Down Expand Up @@ -159,13 +160,13 @@ PYBIND11_MODULE(fasttext_pybind, m) {
})
.def(
"loadModel",
[](fasttext::FastText& m, std::string s) { m.loadModel(s); })
[](fasttext::FastText& m, const std::string& s) { m.loadModel(s); })
.def(
"saveModel",
[](fasttext::FastText& m, std::string s) { m.saveModel(s); })
[](fasttext::FastText& m, const std::string& s) { m.saveModel(s); })
.def(
"test",
[](fasttext::FastText& m, const std::string filename, int32_t k) {
[](fasttext::FastText& m, const std::string& filename, int32_t k) {
std::ifstream ifs(filename);
if (!ifs.is_open()) {
throw std::invalid_argument("Test file cannot be opened!");
Expand All @@ -180,13 +181,13 @@ PYBIND11_MODULE(fasttext_pybind, m) {
"getSentenceVector",
[](fasttext::FastText& m,
fasttext::Vector& v,
const std::string text) {
const std::string& text) {
std::stringstream ioss(text);
m.getSentenceVector(ioss, v);
})
.def(
"tokenize",
[](fasttext::FastText& m, const std::string text) {
[](fasttext::FastText& m, const std::string& text) {
std::vector<std::string> text_split;
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
std::stringstream ioss(text);
Expand All @@ -202,53 +203,57 @@ PYBIND11_MODULE(fasttext_pybind, m) {
.def(
"multilineGetLine",
[](fasttext::FastText& m,
const std::vector<std::string> lines,
const std::vector<std::string>& lines,
const char* onUnicodeError) {
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();

std::vector<std::vector<py::str>> all_words;
all_words.reserve(lines.size());
std::vector<std::vector<py::str>> all_labels;
all_labels.reserve(lines.size());

for (const auto& text : lines) {
auto pair = getLineText(m, text, onUnicodeError);
all_words.push_back(pair.first);
all_labels.push_back(pair.second);
all_words.push_back(std::move(pair.first));
all_labels.push_back(std::move(pair.second));
}
return std::pair<
std::vector<std::vector<py::str>>,
std::vector<std::vector<py::str>>>(all_words, all_labels);
std::vector<std::vector<py::str>>>(std::move(all_words), std::move(all_labels));
})
.def(
"getVocab",
[](fasttext::FastText& m, const char* onUnicodeError) {
py::str s;
std::vector<py::str> vocab_list;
std::vector<int64_t> vocab_freq;
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
vocab_freq = d->getCounts(fasttext::entry_type::word);
std::vector<int64_t> vocab_freq = d->getCounts(fasttext::entry_type::word);
std::vector<py::str> vocab_list;
vocab_list.reserve(vocab_freq.size());
for (int32_t i = 0; i < vocab_freq.size(); i++) {
vocab_list.push_back(
castToPythonString(d->getWord(i), onUnicodeError));
vocab_list.push_back(castToPythonString(d->getWord(i), onUnicodeError));
}
return std::pair<std::vector<py::str>, std::vector<int64_t>>(
vocab_list, vocab_freq);
std::move(vocab_list), std::move(vocab_freq));
})
.def(
"getLabels",
[](fasttext::FastText& m, const char* onUnicodeError) {
std::vector<py::str> labels_list;
std::vector<int64_t> labels_freq;
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
labels_freq = d->getCounts(fasttext::entry_type::label);
std::vector<int64_t> labels_freq = d->getCounts(fasttext::entry_type::label);
std::vector<py::str> labels_list;
labels_list.reserve(labels_freq.size());

for (int32_t i = 0; i < labels_freq.size(); i++) {
labels_list.push_back(
castToPythonString(d->getLabel(i), onUnicodeError));
}
return std::pair<std::vector<py::str>, std::vector<int64_t>>(
labels_list, labels_freq);
std::move(labels_list), std::move(labels_freq));
})
.def(
"quantize",
[](fasttext::FastText& m,
const std::string input,
const std::string& input,
bool qout,
int32_t cutoff,
bool retrain,
Expand Down Expand Up @@ -276,26 +281,48 @@ PYBIND11_MODULE(fasttext_pybind, m) {
// NOTE: text needs to end in a newline
// to exactly mimic the behavior of the cli
[](fasttext::FastText& m,
const std::string text,
const std::string& text,
int32_t k,
fasttext::real threshold,
const char* onUnicodeError) {
std::stringstream ioss(text);
std::vector<std::pair<fasttext::real, std::string>> predictions;
m.predictLine(ioss, predictions, k, threshold);

std::vector<std::pair<fasttext::real, py::str>>
transformedPredictions;
std::vector<std::pair<fasttext::real, py::str>> transformedPredictions;
transformedPredictions.reserve(predictions.size());

for (const auto& prediction : predictions) {
transformedPredictions.push_back(std::make_pair(
transformedPredictions.emplace_back(
prediction.first,
castToPythonString(prediction.second, onUnicodeError)));
castToPythonString(prediction.second, onUnicodeError)
);
}

return transformedPredictions;
})
.def(
    "predictAll",
    // Runs predictLine on a single line of text and returns only the
    // scores, ordered by ascending label string rather than by score.
    // NOTE(review): relies on a two-argument predictLine overload that is
    // presumably declared elsewhere and yields one entry per label -
    // confirm.
    // NOTE: text needs to end in a newline
    // to exactly mimic the behavior of the cli
    [](fasttext::FastText& m, const std::string& text) {
      std::stringstream lineStream(text);
      std::vector<std::pair<fasttext::real, std::string>> scored;
      m.predictLine(lineStream, scored);

      // Sort on the label text (pair.second), not on the score.
      const auto byLabel = [](const auto& lhs, const auto& rhs) {
        return lhs.second < rhs.second;
      };
      std::sort(scored.begin(), scored.end(), byLabel);

      // Drop the labels; callers only receive the scores.
      std::vector<fasttext::real> scores;
      scores.reserve(scored.size());
      for (const auto& entry : scored) {
        scores.push_back(entry.first);
      }
      return scores;
    })
.def(
"multilinePredict",
// NOTE: text needs to end in a newline
// to exactly mimic the behavior of the cli
Expand All @@ -306,26 +333,56 @@ PYBIND11_MODULE(fasttext_pybind, m) {
const char* onUnicodeError) {
std::vector<std::vector<std::pair<fasttext::real, py::str>>>
allPredictions;
allPredictions.reserve(lines.size());
std::vector<std::pair<fasttext::real, std::string>> predictions;

for (const std::string& text : lines) {
std::stringstream ioss(text);
std::stringstream ioss(text); /// stringstream is slow
m.predictLine(ioss, predictions, k, threshold);
std::vector<std::pair<fasttext::real, py::str>>
transformedPredictions;
std::vector<std::pair<fasttext::real, py::str>> transformedPredictions;
transformedPredictions.reserve(predictions.size());
for (const auto& prediction : predictions) {
transformedPredictions.push_back(std::make_pair(
transformedPredictions.emplace_back(
prediction.first,
castToPythonString(prediction.second, onUnicodeError)));
castToPythonString(prediction.second, onUnicodeError)
);
}
allPredictions.push_back(transformedPredictions);
allPredictions.push_back(std::move(transformedPredictions));
}
return allPredictions;
})
.def(
    "multilinePredictAll",
    // Batch form of predictAll: for every input line, runs predictLine and
    // returns one row of scores ordered by ascending label string.
    // NOTE(review): relies on a two-argument predictLine overload that is
    // presumably declared elsewhere - confirm.
    // NOTE: text needs to end in a newline
    // to exactly mimic the behavior of the cli
    [](fasttext::FastText& m, const std::vector<std::string>& lines) {
      std::vector<std::vector<fasttext::real>> rows;
      rows.reserve(lines.size());

      // Sort on the label text (pair.second), not on the score.
      const auto byLabel = [](const auto& lhs, const auto& rhs) {
        return lhs.second < rhs.second;
      };

      // Scratch buffer shared by all iterations.
      // NOTE(review): assumes predictLine resets it on each call - confirm.
      std::vector<std::pair<fasttext::real, std::string>> scored;
      for (const std::string& line : lines) {
        std::stringstream lineStream(line); /// stringstream is slow
        m.predictLine(lineStream, scored);

        std::sort(scored.begin(), scored.end(), byLabel);

        // Drop the labels; each row carries only the scores.
        std::vector<fasttext::real> scores;
        scores.reserve(scored.size());
        for (const auto& entry : scored) {
          scores.push_back(entry.first);
        }

        rows.push_back(std::move(scores));
      }
      return rows;
    })
.def(
"testLabel",
[](fasttext::FastText& m,
const std::string filename,
const std::string& filename,
int32_t k,
fasttext::real threshold) {
std::ifstream ifs(filename);
Expand All @@ -335,7 +392,7 @@ PYBIND11_MODULE(fasttext_pybind, m) {
fasttext::Meter meter;
m.test(ifs, k, threshold, meter);
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
std::unordered_map<std::string, py::dict> returnedValue;
std::unordered_map<std::string, py::dict> returnedValue(d->nlabels());
for (int32_t i = 0; i < d->nlabels(); i++) {
returnedValue[d->getLabel(i)] = py::dict(
"precision"_a = meter.precision(i),
Expand All @@ -347,12 +404,12 @@ PYBIND11_MODULE(fasttext_pybind, m) {
})
.def(
"getWordId",
[](fasttext::FastText& m, const std::string word) {
[](fasttext::FastText& m, const std::string& word) {
return m.getWordId(word);
})
.def(
"getSubwordId",
[](fasttext::FastText& m, const std::string word) {
[](fasttext::FastText& m, const std::string& word) {
return m.getSubwordId(word);
})
.def(
Expand All @@ -364,25 +421,26 @@ PYBIND11_MODULE(fasttext_pybind, m) {
"getWordVector",
[](fasttext::FastText& m,
fasttext::Vector& vec,
const std::string word) { m.getWordVector(vec, word); })
const std::string& word) { m.getWordVector(vec, word); })
.def(
"getSubwords",
[](fasttext::FastText& m,
const std::string word,
const std::string& word,
const char* onUnicodeError) {
std::vector<std::string> subwords;
std::vector<int32_t> ngrams;
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
d->getSubwords(word, ngrams, subwords);

std::vector<py::str> transformedSubwords;
transformedSubwords.reserve(subwords.size());

for (const auto& subword : subwords) {
transformedSubwords.push_back(
castToPythonString(subword, onUnicodeError));
transformedSubwords.push_back(castToPythonString(subword, onUnicodeError));
}

return std::pair<std::vector<py::str>, std::vector<int32_t>>(
transformedSubwords, ngrams);
std::move(transformedSubwords), std::move(ngrams));
})
.def("isQuant", [](fasttext::FastText& m) { return m.isQuant(); });
}
14 changes: 12 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def __str__(self):
map(lambda x: str(os.path.join(FASTTEXT_SRC, x)), fasttext_src_cc)
)

extra_compile_args = " -march=native -ffast-math -Wsuggest-final-methods" \
" -Wsuggest-override -Wodr -flto -ftree-loop-linear" \
" -floop-strip-mine -floop-block "

ext_modules = [
Extension(
str('fasttext_pybind'),
Expand All @@ -74,8 +78,8 @@ def __str__(self):
FASTTEXT_SRC,
],
language='c++',
extra_compile_args=["-O0 -fno-inline -fprofile-arcs -pthread -march=native" if coverage else
"-O3 -funroll-loops -pthread -march=native"],
extra_compile_args=[("-O0 -fno-inline -fprofile-arcs -pthread -march=native" if coverage else
"-O3 -funroll-loops -pthread -march=native") + extra_compile_args],
),
]

Expand All @@ -100,6 +104,7 @@ def cpp_flag(compiler):
"""Return the -std=c++[0x/11/14] compiler flag.
The c++14 is preferred over c++0x/11 (when it is available).
"""
return '-std=c++14'
standards = ['-std=c++14', '-std=c++11', '-std=c++0x']
for standard in standards:
if has_flag(compiler, [standard]):
Expand Down Expand Up @@ -134,6 +139,11 @@ def build_extensions(self):
ct = self.compiler.compiler_type
opts = self.c_opts.get(ct, [])
extra_link_args = []
self.c_opts['unix'] += [
"-flto", "-march=native", "-ffast-math", "-Wsuggest-final-methods",
"-Wsuggest-override", "-Wodr", "-ftree-loop-linear",
"-floop-strip-mine", "-floop-block", "-O3", "-DNDEBUG",
]

if coverage:
coverage_option = '--coverage'
Expand Down
Loading