pull/1/head
Ogoun 5 years ago
parent 8faa55fb17
commit ed1983b715

@ -0,0 +1,51 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.29709.97
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TFIDFbee", "TFIDFbee\TFIDFbee.csproj", "{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lemmatization", "..\HoKeMs\Lemmatization\Lemmatization.csproj", "{BF9F7C1E-098B-4815-BA35-8A9845C66663}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ZeroLevel", "..\..\GIT\Zero\ZeroLevel\ZeroLevel.csproj", "{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|x64.ActiveCfg = Debug|x64
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|x64.Build.0 = Debug|x64
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|Any CPU.Build.0 = Release|Any CPU
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|x64.ActiveCfg = Release|Any CPU
{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|x64.Build.0 = Release|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|x64.ActiveCfg = Debug|x64
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|x64.Build.0 = Debug|x64
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|Any CPU.Build.0 = Release|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|x64.ActiveCfg = Release|Any CPU
{BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|x64.Build.0 = Release|Any CPU
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|Any CPU.Build.0 = Debug|Any CPU
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|x64.ActiveCfg = Debug|x64
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|x64.Build.0 = Debug|x64
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|Any CPU.ActiveCfg = Release|Any CPU
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|Any CPU.Build.0 = Release|Any CPU
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|x64.ActiveCfg = Release|x64
{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {F440F94E-537A-44F4-9103-BD6C7BCAF6E3}
EndGlobalSection
EndGlobal

@ -0,0 +1,8 @@
namespace TFIDFbee
{
    /// <summary>
    /// Plain DTO for a single news article from the source data set.
    /// </summary>
    public class Document
    {
        // Article title (presumably mapped from "metaTitle" in the raw JSON — confirm against the parser).
        public string Title { get; set; }
        // Article body (presumably mapped from "plaintext" — confirm against the parser).
        public string Text { get; set; }
    }
}

@ -0,0 +1,367 @@
using Accord.MachineLearning;
using Lemmatization;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using ZeroLevel;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
namespace TFIDFbee
{
class Program
{
// Path to the Lenta.ru news dump (JSON). Hard-coded for local experiments.
private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204.json";
// Shared lemmatizing tokenizer used to normalize words before TF-IDF.
private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
/// <summary>
/// Pipeline: (1) learn a bag-of-words model over the corpus, (2) project every
/// document into TF-IDF space, (3) compute pairwise cosine measures, and
/// (4) print the 200 closest pairs with their titles.
/// </summary>
static void Main(string[] args)
{
    Configuration.Save(Configuration.ReadFromApplicationConfig());
    var codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfWords();

    // Pass 1: learn vocabulary and document frequencies batch by batch.
    foreach (var batch in ParseBatches(1000))
    {
        codebook.Learn(batch);
        Console.WriteLine($"Documents: {codebook.NumberOfDocuments}");
        Console.WriteLine($"Words: {codebook.NumberOfWords}");
    }

    // Pass 2: re-read the corpus and vectorize each document's body text.
    var vectors = new List<SparceVector>();
    foreach (var docs in ReadRawDocumentBatches(1000))
    {
        foreach (var doc in docs)
        {
            var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token).ToArray();
            vectors.Add(codebook.Transform(words));
        }
    }

    // Pairwise measures. j starts at i + 1, so each unordered pair is visited
    // exactly once. (The original bound was vectors.Count - 1, which silently
    // skipped every pair involving the last vector, and carried a dead
    // "if (i == j) continue" that could never fire.)
    var list = new List<Tuple<double, int, int>>();
    for (int i = 0; i < vectors.Count; i++)
    {
        for (int j = i + 1; j < vectors.Count; j++)
        {
            var diff = vectors[i].Measure(vectors[j]);
            if (diff > double.Epsilon)
            {
                list.Add(Tuple.Create(diff, i, j));
            }
        }
    }

    // Select the 200 pairs with the smallest measure and remember which
    // document indexes we need to show.
    var to_present = list.OrderBy(e => e.Item1).Take(200).ToArray();
    var to_present_map = new Dictionary<int, Tuple<string, string>>();
    foreach (var e in to_present)
    {
        to_present_map[e.Item2] = null;
        to_present_map[e.Item3] = null;
    }

    // Pass 3: fetch the raw (title, text) for the selected documents.
    int index = 0;
    foreach (var docs in ReadRawDocumentBatches(1000))
    {
        foreach (var doc in docs)
        {
            if (to_present_map.ContainsKey(index))
            {
                to_present_map[index] = doc;
            }
            index++;
        }
    }

    index = 0;
    foreach (var e in to_present)
    {
        Console.WriteLine($"#{index++}: {e.Item1}");
        // Show the titles of BOTH documents in the pair. (The original printed
        // the title of one document but the full body text of the other.)
        Console.WriteLine(to_present_map[e.Item2].Item1);
        Console.WriteLine(to_present_map[e.Item3].Item1);
        Console.WriteLine("--------------------");
        Console.WriteLine();
    }
    Console.WriteLine("Completed");
    Console.ReadKey();
}
/// <summary>
/// Tokenizes text into unique, stop-word-free lex tokens and keeps only
/// those containing at least one letter (drops pure punctuation/numbers).
/// </summary>
private static IEnumerable<string> ExtractLemmas(string text)
{
    var tokens = _lexer.ExtractUniqueLexTokensWithoutStopWords(text);
    foreach (var token in tokens)
    {
        var word = token.Token;
        if (word.Any(char.IsLetter))
        {
            yield return word;
        }
    }
}
/// <summary>
/// Materializes each lemmatized-document batch from
/// <see cref="ReadDocumentBatches"/> as an array.
/// </summary>
/// <param name="size">Maximum number of documents per batch.</param>
public static IEnumerable<string[][]> ReadBatches(int size)
{
    // The original allocated (and cleared) an unused List<string[]> here; removed.
    foreach (var batch in ReadDocumentBatches(size))
    {
        yield return batch.ToArray();
    }
}
/// <summary>
/// Streams the raw dump line-by-line and yields batches of lemmatized documents
/// (title + body lemmas merged into one string[] per document). Relies on the
/// dump's layout: a "metaTitle" line precedes the matching "plaintext" line.
/// This is a heuristic line scanner, not a real JSON parser.
/// </summary>
/// <param name="size">Number of documents per yielded batch.</param>
public static IEnumerable<IEnumerable<string[]>> ReadDocumentBatches(int size)
{
    const string titleMarker = "\"metaTitle\":";
    const string textMarker = "\"plaintext\":";
    string line;
    var batch = new List<string[]>();
    string title = null;
    using (StreamReader reader = new StreamReader(source))
    {
        while ((line = reader.ReadLine()) != null)
        {
            var titleIndex = line.IndexOf(titleMarker);
            if (titleIndex >= 0)
            {
                // Value is the text between the first quote after the marker
                // and the last quote on the line.
                var start = line.IndexOf("\"", titleIndex + titleMarker.Length);
                var end = line.LastIndexOf("\"");
                if (start < end && start != -1 && end != -1)
                {
                    title = line.Substring(start + 1, end - start - 1);
                }
            }
            else
            {
                var textIndex = line.IndexOf(textMarker);
                if (textIndex >= 0 && title != null)
                {
                    var start = line.IndexOf("\"", textIndex + textMarker.Length);
                    var end = line.LastIndexOf("\"");
                    if (start < end && start != -1 && end != -1)
                    {
                        var text = line.Substring(start + 1, end - start - 1);
                        batch.Add(ExtractLemmas(title).Concat(ExtractLemmas(text)).ToArray());
                        if (batch.Count >= size)
                        {
                            yield return batch;
                            batch.Clear();
                            // NOTE(review): the original forced GC.Collect(2) here;
                            // removed — explicit full collections in a streaming
                            // loop only hurt throughput.
                        }
                        title = null;
                    }
                }
            }
        }
    }
    if (batch.Count > 0)
    {
        yield return batch;
    }
}
/// <summary>
/// Same line scan as <see cref="ReadDocumentBatches"/>, but yields the raw
/// (title, text) pairs without lemmatization — used to present results.
/// </summary>
/// <param name="size">Number of documents per yielded batch.</param>
public static IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size)
{
    const string titleMarker = "\"metaTitle\":";
    const string textMarker = "\"plaintext\":";
    string line;
    var batch = new List<Tuple<string, string>>();
    string title = null;
    using (StreamReader reader = new StreamReader(source))
    {
        while ((line = reader.ReadLine()) != null)
        {
            var titleIndex = line.IndexOf(titleMarker);
            if (titleIndex >= 0)
            {
                // Value is the text between the first quote after the marker
                // and the last quote on the line.
                var start = line.IndexOf("\"", titleIndex + titleMarker.Length);
                var end = line.LastIndexOf("\"");
                if (start < end && start != -1 && end != -1)
                {
                    title = line.Substring(start + 1, end - start - 1);
                }
            }
            else
            {
                var textIndex = line.IndexOf(textMarker);
                if (textIndex >= 0 && title != null)
                {
                    var start = line.IndexOf("\"", textIndex + textMarker.Length);
                    var end = line.LastIndexOf("\"");
                    if (start < end && start != -1 && end != -1)
                    {
                        var text = line.Substring(start + 1, end - start - 1);
                        batch.Add(Tuple.Create(title, text));
                        if (batch.Count >= size)
                        {
                            yield return batch;
                            batch.Clear();
                            // NOTE(review): removed the original GC.Collect(2) —
                            // forced full collections in a hot loop are an anti-pattern.
                        }
                        title = null;
                    }
                }
            }
        }
    }
    if (batch.Count > 0)
    {
        yield return batch;
    }
}
/// <summary>
/// Minimal streaming scanner for "key": "value" string pairs in JSON-like text.
/// Feed arbitrary chunks via Append; the callback fires for every completed pair.
/// Only string values are recognized — numbers/objects/arrays reset the scan.
/// </summary>
private class RecordParser
{
    // Scanner states for the "key": "value" recognition loop.
    private enum RPState
    {
        WaitKey,        // looking for the opening quote of a key
        ParseKey,       // accumulating key characters
        WaitKeyConfirm, // key closed; expecting ':' (whitespace allowed)
        WaitValue,      // expecting the opening quote of a string value
        ParseValue      // accumulating value characters
    }
    private readonly StringBuilder _builder = new StringBuilder();
    private RPState State = RPState.WaitKey;
    // Previous character seen — used to treat \" as an escaped quote.
    private char _previous = '\0';
    private string _key;
    private string _value;
    // Invoked as (key, value) for each completed pair.
    private readonly Action<string, string> _callback;

    public RecordParser(Action<string, string> callback)
    {
        _callback = callback;
    }

    /// <summary>
    /// Consumes the next chunk of text. State persists across calls, so a
    /// record may span chunk boundaries.
    /// </summary>
    public void Append(string text)
    {
        foreach (var ch in text)
        {
            switch (State)
            {
                case RPState.WaitKey:
                    if (ch.Equals('"'))
                    {
                        State = RPState.ParseKey;
                        _builder.Clear();
                    }
                    break;
                case RPState.ParseKey:
                    // NOTE(review): the `_previous != '\\'` test misreads a quote
                    // preceded by an escaped backslash (\\") as escaped — confirm
                    // the data cannot contain that sequence.
                    if (ch.Equals('"') && _previous != '\\')
                    {
                        if (_builder.Length > 0)
                        {
                            State = RPState.WaitKeyConfirm;
                        }
                        else
                        {
                            // Empty key ("") — restart the scan.
                            State = RPState.WaitKey;
                        }
                    }
                    else
                    {
                        _builder.Append(ch);
                    }
                    break;
                case RPState.WaitKeyConfirm:
                    if (ch.Equals(':'))
                    {
                        _key = _builder.ToString();
                        State = RPState.WaitValue;
                    }
                    else if (ch == ' ' || ch == '\r' || ch == '\n')
                    {
                        // nothing
                    }
                    else
                    {
                        // Not a key after all (e.g. it was a quoted value) — start over.
                        State = RPState.WaitKey;
                    }
                    break;
                case RPState.WaitValue:
                    if (ch.Equals('"'))
                    {
                        State = RPState.ParseValue;
                        _builder.Clear();
                    }
                    else if (ch == ' ' || ch == '\r' || ch == '\n')
                    {
                        // nothing
                    }
                    else
                    {
                        // Non-string value (number/object/array) — skip this pair.
                        State = RPState.WaitKey;
                    }
                    break;
                case RPState.ParseValue:
                    if (ch.Equals('"') && _previous != '\\')
                    {
                        // Empty string values are silently dropped (no callback).
                        if (_builder.Length > 0)
                        {
                            _value = _builder.ToString();
                            _callback(_key, _value);
                        }
                        State = RPState.WaitKey;
                    }
                    else
                    {
                        _builder.Append(ch);
                    }
                    break;
            }
            _previous = ch;
        }
    }
}
/// <summary>
/// Groups parsed [title, text] records from <see cref="Parse"/> into batches
/// of at most <paramref name="size"/> documents.
/// </summary>
/// <param name="size">Maximum number of records per batch.</param>
public static IEnumerable<string[][]> ParseBatches(int size)
{
    var list = new List<string[]>();
    foreach (var record in Parse())
    {
        list.Add(record);
        // Original compared with "> size", which produced batches of size + 1.
        if (list.Count >= size)
        {
            yield return list.ToArray();
            list.Clear();
        }
    }
    if (list.Count > 0)
    {
        yield return list.ToArray();
    }
}
/// <summary>
/// Streams the source file through <see cref="RecordParser"/> and yields a
/// two-element array [title, text] for every completed record.
/// </summary>
public static IEnumerable<string[]> Parse()
{
    var result = new string[2];
    var parser = new RecordParser((k, v) =>
    {
        switch (k)
        {
            case "metaTitle": result[0] = v; break;
            case "plaintext": result[1] = v; break;
        }
    });
    char[] buffer = new char[16536];
    int count;
    using (StreamReader reader = new StreamReader(source))
    {
        // The original performed a single Read, so only the first buffer-full
        // of the file was ever parsed; loop until EOF instead.
        while ((count = reader.Read(buffer, 0, buffer.Length)) > 0)
        {
            parser.Append(new string(buffer, 0, count));
            if (!string.IsNullOrEmpty(result[0]) && !string.IsNullOrEmpty(result[1]))
            {
                // Yield a copy: the original yielded the shared array and then
                // nulled it, corrupting records already handed to the caller.
                yield return new[] { result[0], result[1] };
                result[0] = null;
                result[1] = null;
            }
            // NOTE(review): if a single chunk completes more than one record,
            // earlier ones are overwritten by the callback before this check
            // runs — acceptable only if records are larger than the buffer;
            // confirm against the data.
        }
    }
}
}
}

@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.1</TargetFramework>
<Platforms>AnyCPU;x64</Platforms>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Accord" Version="3.8.0" />
<PackageReference Include="Accord.MachineLearning" Version="3.8.0" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\GIT\Zero\ZeroLevel\ZeroLevel.csproj" />
<ProjectReference Include="..\..\HoKeMs\Lemmatization\Lemmatization.csproj" />
</ItemGroup>
</Project>

@ -1,8 +1,8 @@
using Newtonsoft.Json;
using System;
using System.IO;
using ZeroLevel;
using ZeroLevel.Logging;
using ZeroLevel.Services.Web;
namespace TestApp
{
@ -22,25 +22,6 @@ namespace TestApp
private static void Main(string[] args)
{
/*var fiber = new Fiber();
fiber
.Add((s) => { Console.WriteLine("1"); s.Add<int>("1", 1); return s; })
.Add((s) => { Console.WriteLine("2"); s.Add<int>("2", 2); return s; })
.Add((s) => { Console.WriteLine("3"); s.Add<int>("3", 3); return s; })
.Add((s) => { Console.WriteLine("4"); s.Add<int>("4", 4); return s; })
.Add((s) => { Console.WriteLine("5"); s.Add<int>("5", 5); return s; });
var result = fiber.Run();
Console.WriteLine();
Console.WriteLine("Result");
foreach (var key in result.Keys<int>())
{
Console.WriteLine($"{key}: {result.Get<int>(key)}");
}*/
Configuration.Save(Configuration.ReadFromApplicationConfig());
Bootstrap.Startup<MyService>(args,
() => Configuration.ReadSetFromIniFile("config.ini"))

@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.2</TargetFramework>
<TargetFramework>netcoreapp3.0</TargetFramework>
</PropertyGroup>
<ItemGroup>

@ -36,6 +36,10 @@ namespace ZeroLevel
BaseDirectory = Path.GetDirectoryName(assembly.Location);
AppLocation = assembly.Location;
}
else
{
BaseDirectory = Directory.GetCurrentDirectory();
}
}
#endregion Ctor

@ -0,0 +1,96 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using ZeroLevel.Services.Serialization;
namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Bag-of-words model: learns a vocabulary with per-word document
    /// frequencies and transforms documents into sparse TF-IDF vectors.
    /// </summary>
    public class BagOfWords :
        IBinarySerializable
    {
        // word -> [vocabulary index, document frequency]
        private readonly ConcurrentDictionary<string, int[]> _words =
            new ConcurrentDictionary<string, int[]>();
        // Last assigned vocabulary index (-1 == empty vocabulary).
        int _words_count = -1;
        long _number_of_documents = 0;

        public long NumberOfDocuments => _number_of_documents;
        public int NumberOfWords => _words.Count;

        /// <summary>
        /// Learns from a set of documents. Words within each document must be
        /// lemmatized/stemmed and unique — otherwise document frequencies are
        /// inflated, because each occurrence bumps the counter.
        /// </summary>
        /// <param name="documents">Documents as arrays of unique normalized words.</param>
        public void Learn(string[][] documents)
        {
            Parallel.ForEach(documents, doc =>
            {
                Interlocked.Increment(ref _number_of_documents);
                // (Removed an unused per-document Dictionary the original allocated here.)
                foreach (var word in doc)
                {
                    if (!_words.ContainsKey(word))
                    {
                        // Lost the add race: another thread registered the word
                        // first, so just bump its document frequency.
                        if (false == _words.TryAdd(word, new int[2] { Interlocked.Increment(ref _words_count), 1 }))
                        {
                            Interlocked.Increment(ref _words[word][1]);
                        }
                    }
                    else
                    {
                        Interlocked.Increment(ref _words[word][1]);
                    }
                }
            });
        }

        /// <summary>
        /// Projects a document (lemmatized/stemmed the same way as the training
        /// data) into a sparse TF-IDF vector over the learned vocabulary.
        /// Unknown words are ignored; returns an empty vector for null/empty input.
        /// </summary>
        public SparceVector Transform(string[] doc)
        {
            if (doc == null || doc.Length == 0) return new SparceVector();
            // Term counts within this document.
            var map = new Dictionary<string, int>();
            foreach (var word in doc)
            {
                if (map.ContainsKey(word))
                {
                    map[word]++;
                }
                else
                {
                    map[word] = 1;
                }
            }
            var result = new Dictionary<int, double>();
            foreach (var word in doc)
            {
                if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0]))
                {
                    var tf = (double)map[word] / (double)doc.Length;
                    // Cast before dividing: the original divided a long by an int
                    // (integer division), truncating the ratio and distorting IDF.
                    var idf = Math.Log((double)_number_of_documents / (double)_words[word][1]);
                    var tfidf = tf * idf;
                    if (Math.Abs(tfidf) > double.Epsilon)
                    {
                        result.Add(_words[word][0], tfidf);
                    }
                }
            }
            return new SparceVector(result.Values.ToArray(), result.Keys.ToArray());
        }

        public void Deserialize(IBinaryReader reader)
        {
            throw new NotImplementedException();
        }

        public void Serialize(IBinaryWriter writer)
        {
            throw new NotImplementedException();
        }
    }
}

@ -0,0 +1,101 @@
using System;
using System.Collections.Generic;
using ZeroLevel.Services.Serialization;
namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Sparse vector: parallel arrays of indexes and values, with the Euclidean
    /// norm ("power") precomputed for cosine similarity.
    /// NOTE(review): Measure's merge assumes indexes are sorted ascending —
    /// both constructors rely on caller/loop order for that; confirm upstream.
    /// </summary>
    public sealed class SparceVector
        : IBinarySerializable
    {
        private readonly static int[] EmptyIndexes = new int[0];
        private readonly static double[] EmptyValues = new double[0];

        private int[] indexes;
        private double[] values;
        // Euclidean norm of the vector, cached at construction/deserialization.
        private double power;

        /// <summary>Empty (zero) vector.</summary>
        public SparceVector()
        {
            indexes = EmptyIndexes;
            values = EmptyValues;
            power = 0;
        }

        /// <summary>
        /// Builds a sparse vector from a dense array, keeping only entries
        /// whose magnitude exceeds double.Epsilon.
        /// </summary>
        public SparceVector(double[] vector)
        {
            var l = new List<int>();
            for (int i = 0; i < vector.Length; i++)
            {
                if (Math.Abs(vector[i]) > double.Epsilon)
                {
                    l.Add(i);
                }
            }
            indexes = l.ToArray();
            values = new double[l.Count];
            power = 0;
            for (int i = 0; i < l.Count; i++)
            {
                values[i] = vector[indexes[i]];
                power += values[i] * values[i];
            }
            power = Math.Sqrt(power);
        }

        /// <summary>
        /// Builds a sparse vector from pre-filtered values and their indexes.
        /// Arrays must be the same length; indexes should be sorted ascending.
        /// </summary>
        public SparceVector(double[] vector, int[] indicies)
        {
            indexes = indicies;
            values = vector;
            power = 0;
            for (int i = 0; i < indexes.Length; i++)
            {
                power += values[i] * values[i];
            }
            power = Math.Sqrt(power);
        }

        /// <summary>
        /// Cosine similarity with another sparse vector, computed by a sorted
        /// merge over the two index arrays. Returns 0 when either vector has
        /// zero norm (the original divided unconditionally and produced NaN).
        /// </summary>
        public double Measure(SparceVector other)
        {
            double sum = 0.0d;
            int li = 0, ri = 0;
            int lv, rv;
            while (li < this.indexes.Length &&
                ri < other.indexes.Length)
            {
                lv = this.indexes[li];
                rv = other.indexes[ri];
                if (lv == rv)
                {
                    // Index present in both vectors — accumulate the dot product.
                    sum += this.values[li] * other.values[ri];
                    li++; ri++;
                }
                else if (lv < rv)
                {
                    li++;
                }
                else
                {
                    ri++;
                }
            }
            var norm = this.power * other.power;
            if (norm < double.Epsilon)
            {
                // Empty/zero vector: similarity is undefined; report 0 instead of NaN.
                return 0.0d;
            }
            return sum / norm;
        }

        public void Serialize(IBinaryWriter writer)
        {
            writer.WriteDouble(this.power);
            writer.WriteCollection(indexes);
            writer.WriteCollection(values);
        }

        public void Deserialize(IBinaryReader reader)
        {
            this.power = reader.ReadDouble();
            this.indexes = reader.ReadInt32Collection().ToArray();
            this.values = reader.ReadDoubleCollection().ToArray();
        }
    }
}
Loading…
Cancel
Save

Powered by TurnKey Linux.