|
|
@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
namespace ZeroLevel.Services.Semantic.Fasttext
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
public class FTArgs
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
#region Args
|
|
|
|
|
|
|
|
public double lr;
|
|
|
|
|
|
|
|
public int lrUpdateRate;
|
|
|
|
|
|
|
|
public int dim;
|
|
|
|
|
|
|
|
public int ws;
|
|
|
|
|
|
|
|
public int epoch;
|
|
|
|
|
|
|
|
public int minCount;
|
|
|
|
|
|
|
|
public int minCountLabel;
|
|
|
|
|
|
|
|
public int neg;
|
|
|
|
|
|
|
|
public int wordNgrams;
|
|
|
|
|
|
|
|
public loss_name loss;
|
|
|
|
|
|
|
|
public model_name model;
|
|
|
|
|
|
|
|
public int bucket;
|
|
|
|
|
|
|
|
public int minn;
|
|
|
|
|
|
|
|
public int maxn;
|
|
|
|
|
|
|
|
public int thread;
|
|
|
|
|
|
|
|
public double t;
|
|
|
|
|
|
|
|
public string label;
|
|
|
|
|
|
|
|
public int verbose;
|
|
|
|
|
|
|
|
public string pretrainedVectors;
|
|
|
|
|
|
|
|
public bool saveOutput;
|
|
|
|
|
|
|
|
public bool qout;
|
|
|
|
|
|
|
|
public bool retrain;
|
|
|
|
|
|
|
|
public bool qnorm;
|
|
|
|
|
|
|
|
public ulong cutoff;
|
|
|
|
|
|
|
|
public ulong dsub;
|
|
|
|
|
|
|
|
#endregion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public FTArgs()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
lr = 0.05;
|
|
|
|
|
|
|
|
dim = 100;
|
|
|
|
|
|
|
|
ws = 5;
|
|
|
|
|
|
|
|
epoch = 5;
|
|
|
|
|
|
|
|
minCount = 5;
|
|
|
|
|
|
|
|
minCountLabel = 0;
|
|
|
|
|
|
|
|
neg = 5;
|
|
|
|
|
|
|
|
wordNgrams = 1;
|
|
|
|
|
|
|
|
loss = loss_name.ns;
|
|
|
|
|
|
|
|
model = model_name.sg;
|
|
|
|
|
|
|
|
bucket = 2000000;
|
|
|
|
|
|
|
|
minn = 3;
|
|
|
|
|
|
|
|
maxn = 6;
|
|
|
|
|
|
|
|
thread = 12;
|
|
|
|
|
|
|
|
lrUpdateRate = 100;
|
|
|
|
|
|
|
|
t = 1e-4;
|
|
|
|
|
|
|
|
label = "__label__";
|
|
|
|
|
|
|
|
verbose = 2;
|
|
|
|
|
|
|
|
pretrainedVectors = "";
|
|
|
|
|
|
|
|
saveOutput = false;
|
|
|
|
|
|
|
|
qout = false;
|
|
|
|
|
|
|
|
retrain = false;
|
|
|
|
|
|
|
|
qnorm = false;
|
|
|
|
|
|
|
|
cutoff = 0;
|
|
|
|
|
|
|
|
dsub = 2;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
protected string lossToString(loss_name ln)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
switch (ln)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
case loss_name.hs:
|
|
|
|
|
|
|
|
return "hs";
|
|
|
|
|
|
|
|
case loss_name.ns:
|
|
|
|
|
|
|
|
return "ns";
|
|
|
|
|
|
|
|
case loss_name.softmax:
|
|
|
|
|
|
|
|
return "softmax";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return "Unknown loss!"; // should never happen
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
protected string boolToString(bool b)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (b)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return "true";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return "false";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
protected string modelToString(model_name mn)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
switch (mn)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
case model_name.cbow:
|
|
|
|
|
|
|
|
return "cbow";
|
|
|
|
|
|
|
|
case model_name.sg:
|
|
|
|
|
|
|
|
return "sg";
|
|
|
|
|
|
|
|
case model_name.sup:
|
|
|
|
|
|
|
|
return "sup";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return "Unknown model name!"; // should never happen
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#region Help
|
|
|
|
|
|
|
|
public string printHelp()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
printBasicHelp() +
|
|
|
|
|
|
|
|
printDictionaryHelp() +
|
|
|
|
|
|
|
|
printTrainingHelp() +
|
|
|
|
|
|
|
|
printQuantizationHelp();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private string printBasicHelp()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return "\nThe following arguments are mandatory:\n" +
|
|
|
|
|
|
|
|
" -input training file path\n" +
|
|
|
|
|
|
|
|
" -output output file path\n" +
|
|
|
|
|
|
|
|
"\nThe following arguments are optional:\n" +
|
|
|
|
|
|
|
|
" -verbose verbosity level [" + verbose + "]\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private string printDictionaryHelp()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
"\nThe following arguments for the dictionary are optional:\n" +
|
|
|
|
|
|
|
|
" -minCount minimal number of word occurences [" + minCount + "]\n" +
|
|
|
|
|
|
|
|
" -minCountLabel minimal number of label occurences [" + minCountLabel + "]\n" +
|
|
|
|
|
|
|
|
" -wordNgrams max length of word ngram [" + wordNgrams + "]\n" +
|
|
|
|
|
|
|
|
" -bucket number of buckets [" + bucket + "]\n" +
|
|
|
|
|
|
|
|
" -minn min length of char ngram [" + minn + "]\n" +
|
|
|
|
|
|
|
|
" -maxn max length of char ngram [" + maxn + "]\n" +
|
|
|
|
|
|
|
|
" -t sampling threshold [" + t + "]\n" +
|
|
|
|
|
|
|
|
" -label labels prefix [" + label + "]\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private string printTrainingHelp()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
"\nThe following arguments for training are optional:\n" +
|
|
|
|
|
|
|
|
" -lr learning rate [" + lr + "]\n" +
|
|
|
|
|
|
|
|
" -lrUpdateRate change the rate of updates for the learning rate [" + lrUpdateRate + "]\n" +
|
|
|
|
|
|
|
|
" -dim size of word vectors [" + dim + "]\n" +
|
|
|
|
|
|
|
|
" -ws size of the context window [" + ws + "]\n" +
|
|
|
|
|
|
|
|
" -epoch number of epochs [" + epoch + "]\n" +
|
|
|
|
|
|
|
|
" -neg number of negatives sampled [" + neg + "]\n" +
|
|
|
|
|
|
|
|
" -loss loss function {ns, hs, softmax} [" + lossToString(loss) + "]\n" +
|
|
|
|
|
|
|
|
" -thread number of threads [" + thread + "]\n" +
|
|
|
|
|
|
|
|
" -pretrainedVectors pretrained word vectors for supervised learning [" + pretrainedVectors + "]\n" +
|
|
|
|
|
|
|
|
" -saveOutput whether output params should be saved [" + boolToString(saveOutput) + "]\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private string printQuantizationHelp()
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
"\nThe following arguments for quantization are optional:\n" +
|
|
|
|
|
|
|
|
" -cutoff number of words and ngrams to retain [" + cutoff + "]\n" +
|
|
|
|
|
|
|
|
" -retrain whether embeddings are finetuned if a cutoff is applied [" + boolToString(retrain) + "]\n" +
|
|
|
|
|
|
|
|
" -qnorm whether the norm is quantized separately [" + boolToString(qnorm) + "]\n" +
|
|
|
|
|
|
|
|
" -qout whether the classifier is quantized [" + boolToString(qout) + "]\n" +
|
|
|
|
|
|
|
|
" -dsub size of each sub-vector [" + dsub + "]\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endregion
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|