From 21e57e3d5fe889c4cef3dbf771f0632dc7ec5bd7 Mon Sep 17 00:00:00 2001 From: Ogoun Date: Fri, 20 Mar 2020 05:08:17 +0300 Subject: [PATCH] FastText start --- .../Services/Semantic/Fasttext/FTArgs.cs | 164 ++++++++++++++++++ ZeroLevel/Services/Semantic/Fasttext/enums.cs | 5 + 2 files changed, 169 insertions(+) create mode 100644 ZeroLevel/Services/Semantic/Fasttext/FTArgs.cs create mode 100644 ZeroLevel/Services/Semantic/Fasttext/enums.cs diff --git a/ZeroLevel/Services/Semantic/Fasttext/FTArgs.cs b/ZeroLevel/Services/Semantic/Fasttext/FTArgs.cs new file mode 100644 index 0000000..7fd1ad8 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Fasttext/FTArgs.cs @@ -0,0 +1,164 @@ +namespace ZeroLevel.Services.Semantic.Fasttext +{ + public class FTArgs + { + #region Args + public double lr; + public int lrUpdateRate; + public int dim; + public int ws; + public int epoch; + public int minCount; + public int minCountLabel; + public int neg; + public int wordNgrams; + public loss_name loss; + public model_name model; + public int bucket; + public int minn; + public int maxn; + public int thread; + public double t; + public string label; + public int verbose; + public string pretrainedVectors; + public bool saveOutput; + public bool qout; + public bool retrain; + public bool qnorm; + public ulong cutoff; + public ulong dsub; + #endregion + + public FTArgs() + { + lr = 0.05; + dim = 100; + ws = 5; + epoch = 5; + minCount = 5; + minCountLabel = 0; + neg = 5; + wordNgrams = 1; + loss = loss_name.ns; + model = model_name.sg; + bucket = 2000000; + minn = 3; + maxn = 6; + thread = 12; + lrUpdateRate = 100; + t = 1e-4; + label = "__label__"; + verbose = 2; + pretrainedVectors = ""; + saveOutput = false; + qout = false; + retrain = false; + qnorm = false; + cutoff = 0; + dsub = 2; + } + + protected string lossToString(loss_name ln) + { + switch (ln) + { + case loss_name.hs: + return "hs"; + case loss_name.ns: + return "ns"; + case loss_name.softmax: + return "softmax"; + } + return 
"Unknown loss!"; // should never happen + } + + protected string boolToString(bool b) + { + if (b) + { + return "true"; + } + else + { + return "false"; + } + } + + protected string modelToString(model_name mn) + { + switch (mn) + { + case model_name.cbow: + return "cbow"; + case model_name.sg: + return "sg"; + case model_name.sup: + return "sup"; + } + return "Unknown model name!"; // should never happen + } + + #region Help + public string printHelp() + { + return + printBasicHelp() + + printDictionaryHelp() + + printTrainingHelp() + + printQuantizationHelp(); + } + + + private string printBasicHelp() + { + return "\nThe following arguments are mandatory:\n" + + " -input training file path\n" + + " -output output file path\n" + + "\nThe following arguments are optional:\n" + + " -verbose verbosity level [" + verbose + "]\n"; + } + + private string printDictionaryHelp() + { + return + "\nThe following arguments for the dictionary are optional:\n" + + " -minCount minimal number of word occurrences [" + minCount + "]\n" + + " -minCountLabel minimal number of label occurrences [" + minCountLabel + "]\n" + + " -wordNgrams max length of word ngram [" + wordNgrams + "]\n" + + " -bucket number of buckets [" + bucket + "]\n" + + " -minn min length of char ngram [" + minn + "]\n" + + " -maxn max length of char ngram [" + maxn + "]\n" + + " -t sampling threshold [" + t + "]\n" + + " -label labels prefix [" + label + "]\n"; + } + + private string printTrainingHelp() + { + return + "\nThe following arguments for training are optional:\n" + + " -lr learning rate [" + lr + "]\n" + + " -lrUpdateRate change the rate of updates for the learning rate [" + lrUpdateRate + "]\n" + + " -dim size of word vectors [" + dim + "]\n" + + " -ws size of the context window [" + ws + "]\n" + + " -epoch number of epochs [" + epoch + "]\n" + + " -neg number of negatives sampled [" + neg + "]\n" + + " -loss loss function {ns, hs, softmax} [" + lossToString(loss) + "]\n" + + " -thread number of 
threads [" + thread + "]\n" + + " -pretrainedVectors pretrained word vectors for supervised learning [" + pretrainedVectors + "]\n" + + " -saveOutput whether output params should be saved [" + boolToString(saveOutput) + "]\n"; + } + + private string printQuantizationHelp() + { + return + "\nThe following arguments for quantization are optional:\n" + + " -cutoff number of words and ngrams to retain [" + cutoff + "]\n" + + " -retrain whether embeddings are finetuned if a cutoff is applied [" + boolToString(retrain) + "]\n" + + " -qnorm whether the norm is quantized separately [" + boolToString(qnorm) + "]\n" + + " -qout whether the classifier is quantized [" + boolToString(qout) + "]\n" + + " -dsub size of each sub-vector [" + dsub + "]\n"; + } + #endregion + } +} diff --git a/ZeroLevel/Services/Semantic/Fasttext/enums.cs b/ZeroLevel/Services/Semantic/Fasttext/enums.cs new file mode 100644 index 0000000..b081a14 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Fasttext/enums.cs @@ -0,0 +1,5 @@ +namespace ZeroLevel.Services.Semantic.Fasttext +{ + public enum model_name : int { cbow = 1, sg, sup }; + public enum loss_name : int { hs = 1, ns, softmax }; +}