|
|
|
@ -0,0 +1,487 @@
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.IO;
|
|
|
|
|
using System.Text;
|
|
|
|
|
|
|
|
|
|
namespace ZeroLevel.Services.Semantic.Fasttext
|
|
|
|
|
{
|
|
|
|
|
/// <summary>
/// One vocabulary entry: the token text, its occurrence count, its kind
/// (word or label) and the ids of its subwords.
/// </summary>
internal class FTEntry
{
    /// <summary>Token text.</summary>
    public string word;

    /// <summary>Number of times the token has been counted.</summary>
    public long count;

    /// <summary>Whether the token is a word or a label.</summary>
    public entry_type type;

    /// <summary>
    /// Subword ids of this entry. Created eagerly so that code which clears
    /// or appends to the list never dereferences null.
    /// </summary>
    public List<int> subwords = new List<int>();
}
|
|
|
|
|
|
|
|
|
|
internal class FTDictionary
|
|
|
|
|
{
|
|
|
|
|
// Vocabulary size limit; readFromFile starts pruning once size exceeds 75% of it.
const int MAX_VOCAB_SIZE = 30000000;
// getLine (random variant) stops once more than this many in-vocabulary tokens were read.
const int MAX_LINE_SIZE = 1024;
// Token that stands for an end of line.
const string EOS = "</s>";
// Markers wrapped around a word before its subwords are computed.
const string BOW = "<";
const string EOW = ">";

private readonly FTArgs _args;
// Open-addressing hash table: slot -> index into words, or -1 when the slot is free.
private List<int> word2int;
// Vocabulary entries. Initialised inline because no visible constructor assigns it
// and add() appends to it.
private List<FTEntry> words = new List<FTEntry>();
// Per-entry threshold compared against a random number in discard().
float[] pdiscard;
// Number of entries registered in word2int.
int size;
// Number of entries of type word.
int nwords;
// Number of entries of type label.
int nlabels;
// Number of tokens passed to add().
long ntokens;
// 0: pushHash drops every hash; > 0: hashes are remapped through pruneidx;
// negative (initial value -1): hashes are passed through unchanged.
long pruneidx_size;
// Remapping table used by pushHash. Initialised inline so a lookup never hits null.
Dictionary<int, int> pruneidx = new Dictionary<int, int>();
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Creates an empty dictionary that can be filled with add / readFromFile.
/// </summary>
/// <param name="args">Model arguments consulted by the dictionary.</param>
public FTDictionary(FTArgs args)
{
    _args = args;

    // find() computes "hash % word2int.Count" and treats -1 as a free slot, so the
    // table has to be pre-sized and filled with -1. An empty list made every
    // lookup fail with a division by zero.
    word2int = new List<int>(MAX_VOCAB_SIZE);
    for (var slot = 0; slot < MAX_VOCAB_SIZE; slot++)
    {
        word2int.Add(-1);
    }

    size = 0;
    nwords = 0;
    nlabels = 0;
    ntokens = 0;
    pruneidx_size = -1;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Creates a dictionary, resets all counters and then populates it by
/// calling load on the given stream.
/// </summary>
/// <param name="args">Model arguments consulted by the dictionary.</param>
/// <param name="stream">Stream handed to load.</param>
public FTDictionary(FTArgs args, Stream stream)
{
    _args = args;

    // Counters start from a clean state before load runs.
    pruneidx_size = -1;
    ntokens = 0;
    nlabels = 0;
    nwords = 0;
    size = 0;

    load(stream);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the hash-table slot for <paramref name="w"/>, hashing the word first.
/// </summary>
public int find(string w)
{
    uint wordHash = hash(w);
    return find(w, wordHash);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Linear-probes word2int starting at <c>h % word2int.Count</c> and returns the first
/// slot that is either free (-1) or already holds <paramref name="w"/>.
/// </summary>
/// <param name="w">Word to look up.</param>
/// <param name="h">Hash of the word.</param>
/// <returns>Slot index into word2int (not an index into the vocabulary).</returns>
public int find(string w, uint h)
{
    int capacity = word2int.Count;
    int slot = (int)(h % capacity);
    for (; ; )
    {
        int entryIndex = word2int[slot];
        if (entryIndex == -1 || words[entryIndex].word == w)
        {
            return slot;
        }
        // Slot taken by another word: try the next one, wrapping around.
        slot = (slot + 1) % capacity;
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Counts one occurrence of <paramref name="w"/>: bumps ntokens and either creates a
/// new entry with count 1 or increments the count of the existing entry.
/// </summary>
public void add(string w)
{
    int slot = find(w);
    ntokens++;

    int existing = word2int[slot];
    if (existing != -1)
    {
        words[existing].count++;
        return;
    }

    // First time this token is seen: append it and register its index in the table.
    var entry = new FTEntry();
    entry.word = w;
    entry.count = 1;
    entry.type = getType(w);
    words.Add(entry);
    word2int[slot] = size++;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the stored subword ids of the word with index <paramref name="id"/>.
/// </summary>
/// <param name="id">Word index, expected in [0, nwords).</param>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="id"/> is negative or not smaller than nwords.
/// </exception>
public List<int> getSubwords(int id)
{
    // The original condition (id >= 0 || id < nwords) was true for every id,
    // so the method always threw. Reject only ids outside [0, nwords).
    if (id < 0 || id >= nwords)
    {
        throw new IndexOutOfRangeException($"Id ({id}) must be between 0 and {nwords}");
    }
    return words[id].subwords;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the subword ids for <paramref name="word"/>. Known words use their stored
/// list; unknown words get a freshly computed list (empty for the EOS token).
/// </summary>
public List<int> getSubwords(string word)
{
    int id = getId(word);
    if (id < 0)
    {
        // Not in the vocabulary: derive the n-grams from the marked-up word.
        var computed = new List<int>();
        if (word != EOS)
        {
            computeSubwords(BOW + word + EOW, computed);
        }
        return computed;
    }
    return getSubwords(id);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Fills <paramref name="ngrams"/> and <paramref name="substrings"/> with the subword
/// ids of <paramref name="word"/> and their text. Both lists are cleared first. When the
/// word is in the vocabulary, its own id and text come first.
/// </summary>
public void getSubwords(string word, List<int> ngrams, List<string> substrings)
{
    int id = getId(word);
    ngrams.Clear();
    substrings.Clear();

    if (id >= 0)
    {
        ngrams.Add(id);
        substrings.Add(words[id].word);
    }

    // The EOS token has no character n-grams.
    if (word == EOS)
    {
        return;
    }
    computeSubwords(BOW + word + EOW, ngrams, substrings);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Decides whether the word with index <paramref name="id"/> should be dropped.
/// </summary>
/// <param name="id">Word index, expected in [0, nwords).</param>
/// <param name="rand">Random value compared against the word's pdiscard entry.</param>
/// <returns>
/// Always false when the model is <c>model_name.sup</c>; otherwise true when
/// <paramref name="rand"/> is greater than <c>pdiscard[id]</c>.
/// </returns>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="id"/> is negative or not smaller than nwords.
/// </exception>
public bool discard(int id, float rand)
{
    // The original condition (id >= 0 || id < nwords) was always true and made the
    // method throw for every id.
    if (id < 0 || id >= nwords)
    {
        throw new IndexOutOfRangeException($"Id ({id}) must be between 0 and {nwords}");
    }
    if (_args.model == model_name.sup)
    {
        return false;
    }
    return rand > pdiscard[id];
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// 32-bit FNV-1a hash of <paramref name="str"/>, computed over its UTF-8 bytes.
/// </summary>
/// <remarks>
/// The reference fastText implementation hashes the bytes of a UTF-8 std::string and
/// sign-extends each byte before the XOR. Hashing UTF-16 code units gives the same
/// result for ASCII but different buckets for any other text, so the bytes are
/// produced explicitly here.
/// </remarks>
public uint hash(string str)
{
    uint h = 2166136261;
    byte[] utf8 = Encoding.UTF8.GetBytes(str);
    unchecked
    {
        for (var i = 0; i < utf8.Length; i++)
        {
            // Sign extension mirrors uint32_t(int8_t(byte)) in the reference code.
            h ^= (uint)(sbyte)utf8[i];
            h *= 16777619;
        }
    }
    return h;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the vocabulary index stored for <paramref name="w"/> (using the precomputed
/// hash <paramref name="h"/>), or -1 when its slot is empty.
/// </summary>
public int getId(string w, uint h)
{
    return word2int[find(w, h)];
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the vocabulary index stored for <paramref name="w"/>, or -1 when its
/// slot is empty.
/// </summary>
public int getId(string w)
{
    return word2int[find(w)];
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the type (word or label) of the entry with index <paramref name="id"/>.
/// </summary>
/// <param name="id">Entry index, expected in [0, size).</param>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="id"/> is negative or not smaller than size.
/// </exception>
public entry_type getType(int id)
{
    // The original condition (id >= 0 || id < size) was always true and made the
    // method throw for every id.
    if (id < 0 || id >= size)
    {
        throw new IndexOutOfRangeException($"Id ({id}) must be between 0 and {size}");
    }
    return words[id].type;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Classifies a token: a label when it starts with the configured label prefix
/// (<c>_args.label</c>), a word otherwise.
/// </summary>
public entry_type getType(string w)
{
    // Ordinal prefix test: IndexOf(string) is culture-sensitive and scans the whole
    // token, although only position 0 matters.
    return w.StartsWith(_args.label, StringComparison.Ordinal)
        ? entry_type.label
        : entry_type.word;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the text of the entry with index <paramref name="id"/>.
/// </summary>
/// <param name="id">Entry index, expected in [0, size).</param>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="id"/> is negative or not smaller than size.
/// </exception>
public string getWord(int id)
{
    // The original condition (id >= 0 || id < size) was always true and made the
    // method throw for every id.
    if (id < 0 || id >= size)
    {
        throw new IndexOutOfRangeException($"Id ({id}) must be between 0 and {size}");
    }
    return words[id].word;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Enumerates the character n-grams of <paramref name="word"/> with lengths between
/// <c>_args.minn</c> and <c>_args.maxn</c>. For each one it appends
/// <c>nwords + hash(ngram) % _args.bucket</c> to <paramref name="ngrams"/> and the n-gram
/// text to <paramref name="substrings"/>.
/// </summary>
/// <remarks>
/// Length-1 n-grams at the very start or very end of the word are skipped.
/// Characters are counted per code point: a surrogate pair counts as one character.
/// </remarks>
public void computeSubwords(string word, List<int> ngrams, List<string> substrings)
{
    for (var i = 0; i < word.Length; i++)
    {
        // An n-gram may only start on the first code unit of a character. The original
        // test (c & 0xC0) == 0x80 detects UTF-8 continuation bytes; on UTF-16 chars it
        // misfires on every char whose low byte is 0x80..0xBF. The UTF-16 equivalent
        // of a continuation unit is a low surrogate.
        if (char.IsLowSurrogate(word[i]))
        {
            continue;
        }

        var ngram = new StringBuilder();
        for (int j = i, n = 1; j < word.Length && n <= _args.maxn; n++)
        {
            ngram.Append(word[j++]);
            // Keep the second half of a surrogate pair together with the first.
            while (j < word.Length && char.IsLowSurrogate(word[j]))
            {
                ngram.Append(word[j++]);
            }

            if (n >= _args.minn && !(n == 1 && (i == 0 || j == word.Length)))
            {
                var sw = ngram.ToString();
                var h = hash(sw) % _args.bucket;
                ngrams.Add((int)(nwords + h));
                substrings.Add(sw);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Enumerates the character n-grams of <paramref name="word"/> with lengths between
/// <c>_args.minn</c> and <c>_args.maxn</c> and passes <c>hash(ngram) % _args.bucket</c>
/// for each one to pushHash, which appends the resulting id to <paramref name="ngrams"/>.
/// </summary>
/// <remarks>
/// Length-1 n-grams at the very start or very end of the word are skipped.
/// Characters are counted per code point: a surrogate pair counts as one character.
/// </remarks>
public void computeSubwords(string word, List<int> ngrams)
{
    for (var i = 0; i < word.Length; i++)
    {
        // An n-gram may only start on the first code unit of a character. The original
        // test (c & 0xC0) == 0x80 detects UTF-8 continuation bytes; on UTF-16 chars it
        // misfires on every char whose low byte is 0x80..0xBF. The UTF-16 equivalent
        // of a continuation unit is a low surrogate.
        if (char.IsLowSurrogate(word[i]))
        {
            continue;
        }

        var ngram = new StringBuilder();
        for (int j = i, n = 1; j < word.Length && n <= _args.maxn; n++)
        {
            ngram.Append(word[j++]);
            // Keep the second half of a surrogate pair together with the first.
            while (j < word.Length && char.IsLowSurrogate(word[j]))
            {
                ngram.Append(word[j++]);
            }

            if (n >= _args.minn && !(n == 1 && (i == 0 || j == word.Length)))
            {
                var sw = ngram.ToString();
                var h = (int)(hash(sw) % _args.bucket);
                pushHash(ngrams, h);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Appends <c>nwords + id</c> to <paramref name="hashes"/>, honouring the prune table.
/// </summary>
/// <remarks>
/// Nothing is appended when <paramref name="id"/> is negative or pruneidx_size is 0.
/// When pruneidx_size is positive the id is first remapped through pruneidx, and ids
/// missing from that table are dropped.
/// </remarks>
public void pushHash(List<int> hashes, int id)
{
    if (id < 0 || pruneidx_size == 0)
    {
        return;
    }

    if (pruneidx_size > 0)
    {
        int remapped;
        if (!pruneidx.TryGetValue(id, out remapped))
        {
            return;
        }
        id = remapped;
    }

    hashes.Add(nwords + id);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Rewinds <paramref name="stream"/> to its beginning once it has been read to the end,
/// so that repeated getLine calls cycle through the input.
/// </summary>
/// <remarks>
/// The stream is left untouched while data remains. Rewinding whenever Position is
/// greater than 0 would make every getLine call start over from the first line.
/// </remarks>
public void reset(Stream stream)
{
    if (stream.Position >= stream.Length)
    {
        stream.Position = 0;
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the text of label number <paramref name="lid"/>; labels are stored after
/// the nwords word entries.
/// </summary>
/// <exception cref="Exception">
/// <paramref name="lid"/> is negative or not smaller than nlabels.
/// </exception>
public string getLabel(int lid)
{
    bool inRange = lid >= 0 && lid < nlabels;
    if (!inRange)
    {
        throw new Exception($"Label id is out of range [0, {nlabels}]");
    }
    return words[nwords + lid].word;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Rebuilds the subword list of every entry: the entry's own index first, followed
/// (except for the EOS token) by the ids computed from the word wrapped in BOW/EOW.
/// </summary>
public void initNgrams()
{
    for (var i = 0; i < size; i++)
    {
        FTEntry entry = words[i];

        // Entries created by add() do not set subwords, so the list may still be null.
        if (entry.subwords == null)
        {
            entry.subwords = new List<int>();
        }
        else
        {
            entry.subwords.Clear();
        }

        entry.subwords.Add(i);
        if (entry.word != EOS)
        {
            computeSubwords(BOW + entry.word + EOW, entry.subwords);
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reads the next whitespace-delimited token from <paramref name="stream"/> into
/// <paramref name="word"/> (which is cleared first).
/// </summary>
/// <remarks>
/// Separators are space, \n, \r, \t, \v, \f and \0. A newline met before any token
/// byte yields the EOS token. A newline that ends a token is pushed back, so the
/// next call reports EOS. This requires a seekable stream. Token bytes are decoded
/// as UTF-8.
/// </remarks>
/// <param name="stream">Input to read from.</param>
/// <param name="word">Receives the token text.</param>
/// <returns>True when a token (or EOS) was read; false at end of stream.</returns>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
public bool readWord(Stream stream, StringBuilder word)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }
    if (word == null)
    {
        throw new ArgumentNullException(nameof(word));
    }

    word.Clear();
    var raw = new List<byte>();
    int c;
    while ((c = stream.ReadByte()) != -1)
    {
        bool isSeparator = c == ' ' || c == '\n' || c == '\r' || c == '\t' ||
                           c == '\v' || c == '\f' || c == '\0';
        if (!isSeparator)
        {
            raw.Add((byte)c);
            continue;
        }

        if (raw.Count == 0)
        {
            if (c == '\n')
            {
                word.Append(EOS);
                return true;
            }
            // Leading separator: skip it.
            continue;
        }

        if (c == '\n')
        {
            // Un-read the newline so the following call returns EOS.
            stream.Position -= 1;
        }
        break;
    }

    if (raw.Count == 0)
    {
        return false;
    }
    word.Append(Encoding.UTF8.GetString(raw.ToArray()));
    return true;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Builds the vocabulary from <paramref name="stream"/>: every token returned by
/// readWord is passed to add, then the vocabulary is thresholded with
/// <c>_args.minCount</c> / <c>_args.minCountLabel</c> and the discard table and
/// subword lists are initialised.
/// </summary>
/// <remarks>
/// While reading, once size exceeds 75% of MAX_VOCAB_SIZE the vocabulary is pruned
/// with a threshold that grows by one on each pruning pass. Progress output from
/// the original implementation is not reproduced.
/// </remarks>
/// <exception cref="ArgumentException">The resulting vocabulary is empty.</exception>
public void readFromFile(Stream stream)
{
    var word = new StringBuilder();
    long minThreshold = 1;
    while (readWord(stream, word))
    {
        add(word.ToString());
        if (size > 0.75 * MAX_VOCAB_SIZE)
        {
            minThreshold++;
            threshold(minThreshold, minThreshold);
        }
    }

    threshold(_args.minCount, _args.minCountLabel);
    initTableDiscard();
    initNgrams();

    if (size == 0)
    {
        throw new ArgumentException("Empty vocabulary. Try a smaller -minCount value.");
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Prunes rare entries and rebuilds the hash table.
/// </summary>
/// <remarks>
/// Entries are sorted by type (ascending) and then by count (descending). Words with
/// count below <paramref name="t"/> and labels with count below <paramref name="tl"/>
/// are removed. size, nwords and nlabels are recomputed, and every slot of word2int
/// is reset to -1 before the surviving entries are re-inserted.
/// </remarks>
/// <param name="t">Minimum count for entries of type word.</param>
/// <param name="tl">Minimum count for entries of type label.</param>
public void threshold(long t, long tl)
{
    words.Sort((a, b) =>
    {
        if (a.type != b.type)
        {
            return a.type < b.type ? -1 : 1;
        }
        // Higher counts first.
        return b.count.CompareTo(a.count);
    });

    words.RemoveAll(e =>
        (e.type == entry_type.word && e.count < t) ||
        (e.type == entry_type.label && e.count < tl));
    words.TrimExcess();

    size = 0;
    nwords = 0;
    nlabels = 0;
    for (var slot = 0; slot < word2int.Count; slot++)
    {
        word2int[slot] = -1;
    }

    // Re-insert in list order so the value stored for an entry is its index in words.
    foreach (var entry in words)
    {
        int h = find(entry.word);
        word2int[h] = size++;
        if (entry.type == entry_type.word)
        {
            nwords++;
        }
        if (entry.type == entry_type.label)
        {
            nlabels++;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Allocates pdiscard with one slot per entry and fills it with
/// <c>sqrt(t / f) + t / f</c>, where <c>t</c> is <c>_args.t</c> and <c>f</c> is the
/// entry's count divided by ntokens.
/// </summary>
public void initTableDiscard()
{
    // Arrays cannot be resized in place; allocate a table of the current size.
    pdiscard = new float[size];
    for (var i = 0; i < size; i++)
    {
        var f = ((float)words[i].count) / (float)(ntokens);
        double ratio = _args.t / f;
        pdiscard[i] = (float)(Math.Sqrt(ratio) + ratio);
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Collects, in vocabulary order, the counts of all entries whose type equals
/// <paramref name="type"/>.
/// </summary>
public List<long> getCounts(entry_type type)
{
    var result = new List<long>();
    for (var i = 0; i < words.Count; i++)
    {
        FTEntry entry = words[i];
        if (entry.type != type)
        {
            continue;
        }
        result.Add(entry.count);
    }
    return result;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Adds word n-gram features to <paramref name="line"/>: for every start position the
/// hashes of up to <paramref name="n"/> consecutive tokens are combined and each
/// intermediate value, reduced modulo <c>_args.bucket</c>, is passed to pushHash.
/// </summary>
/// <remarks>
/// The running hash is an unsigned 64-bit value, as in the reference implementation.
/// With 32-bit signed arithmetic the product overflows and the modulo can turn
/// negative, in which case pushHash silently drops the feature.
/// </remarks>
public void addWordNgrams(List<int> line, List<int> hashes, int n)
{
    for (var i = 0; i < hashes.Count; i++)
    {
        // Sign-extend, then reinterpret as unsigned (matches uint64_t h = int32 value).
        ulong h = unchecked((ulong)(long)hashes[i]);
        for (var j = i + 1; j < hashes.Count && j < i + n; j++)
        {
            h = unchecked(h * 116049371UL + (ulong)(long)hashes[j]);
            pushHash(line, (int)(h % (ulong)_args.bucket));
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Appends the features of one token to <paramref name="line"/>.
/// </summary>
/// <remarks>
/// In-vocabulary token (<paramref name="wid"/> &gt;= 0): the word id alone when
/// <c>_args.maxn</c> is not positive, otherwise its stored subword ids.
/// Out-of-vocabulary token: its computed subwords, unless the token is EOS.
/// </remarks>
public void addSubwords(List<int> line, string token, int wid)
{
    if (wid >= 0)
    {
        if (_args.maxn > 0)
        {
            line.AddRange(getSubwords(wid));
        }
        else
        {
            line.Add(wid);
        }
        return;
    }

    if (token != EOS)
    {
        computeSubwords(BOW + token + EOW, line);
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reads one line of tokens from <paramref name="stream"/> and stores in
/// <paramref name="words"/> the ids of the in-vocabulary words that survive
/// random discarding.
/// </summary>
/// <remarks>
/// Out-of-vocabulary tokens are skipped and not counted. Reading stops at the EOS
/// token, at end of stream, or once more than MAX_LINE_SIZE tokens were counted.
/// </remarks>
/// <param name="stream">Input; passed to reset before reading.</param>
/// <param name="words">Cleared, then filled with the kept word ids.</param>
/// <param name="rng">Source of the random values handed to discard.</param>
/// <returns>Number of in-vocabulary tokens read.</returns>
public int getLine(Stream stream, List<int> words, Random rng)
{
    var token = new StringBuilder();
    int count = 0;

    reset(stream);
    words.Clear();
    while (readWord(stream, token))
    {
        string text = token.ToString();
        int h = find(text);
        int wid = word2int[h];
        if (wid < 0)
        {
            continue;
        }

        count++;
        if (getType(wid) == entry_type.word && !discard(wid, (float)rng.NextDouble()))
        {
            words.Add(wid);
        }
        if (count > MAX_LINE_SIZE || text == EOS)
        {
            break;
        }
    }
    return count;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reads one line of tokens from <paramref name="stream"/>, collecting word features
/// in <paramref name="words"/> and label indices in <paramref name="labels"/>.
/// </summary>
/// <remarks>
/// Word tokens, in or out of vocabulary, contribute their subwords through
/// addSubwords, and their hashes feed addWordNgrams at the end. Label tokens are
/// recorded only when they are in the vocabulary, as <c>wid - nwords</c>.
/// Reading stops at the EOS token or at end of stream.
/// </remarks>
/// <param name="stream">Input; passed to reset before reading.</param>
/// <param name="words">Cleared, then filled with word and n-gram feature ids.</param>
/// <param name="labels">Cleared, then filled with label indices.</param>
/// <returns>Number of tokens read.</returns>
public int getLine(Stream stream, List<int> words, List<int> labels)
{
    var wordHashes = new List<int>();
    var token = new StringBuilder();
    int count = 0;

    reset(stream);
    words.Clear();
    labels.Clear();
    while (readWord(stream, token))
    {
        string text = token.ToString();
        uint h = hash(text);
        int wid = getId(text, h);
        entry_type type = wid < 0 ? getType(text) : getType(wid);

        count++;
        if (type == entry_type.word)
        {
            addSubwords(words, text, wid);
            // Stored as a signed 32-bit value; the bit pattern is preserved.
            wordHashes.Add(unchecked((int)h));
        }
        else if (type == entry_type.label && wid >= 0)
        {
            labels.Add(wid - nwords);
        }
        if (text == EOS)
        {
            break;
        }
    }
    addWordNgrams(words, wordHashes, _args.wordNgrams);
    return count;
}
|
|
|
|
|
}
|
|
|
|
|
}
|