pull/1/head
Ogoun 5 years ago
parent cc54d87d87
commit 36a429b45b

@ -9,6 +9,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lemmatization", "..\Lemmati
EndProject EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ZeroLevel", "..\ZeroLevel\ZeroLevel.csproj", "{6AF46F95-EA67-4258-96B1-7BBC57EB965D}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ZeroLevel", "..\ZeroLevel\ZeroLevel.csproj", "{6AF46F95-EA67-4258-96B1-7BBC57EB965D}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tests", "Tests\Tests.csproj", "{41061774-D2A1-4291-8909-62E4A63B03B4}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@ -41,6 +43,14 @@ Global
{6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|Any CPU.Build.0 = Release|Any CPU {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|Any CPU.Build.0 = Release|Any CPU
{6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.ActiveCfg = Release|x64 {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.ActiveCfg = Release|x64
{6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.Build.0 = Release|x64 {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.Build.0 = Release|x64
{41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|x64.ActiveCfg = Debug|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|x64.Build.0 = Debug|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Release|Any CPU.Build.0 = Release|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Release|x64.ActiveCfg = Release|Any CPU
{41061774-D2A1-4291-8909-62E4A63B03B4}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

@ -6,48 +6,9 @@ using System.Linq;
using System.Threading; using System.Threading;
using TFIDFbee.Reader; using TFIDFbee.Reader;
using ZeroLevel.Services.Semantic; using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
namespace TFIDFbee namespace TFIDFbee
{ {
/// <summary>
/// Accumulates per-term document frequencies over a corpus and computes
/// an inverse document frequency for a term.
/// </summary>
public class IDF
{
    // term -> number of learned documents whose unique tokens contained the term
    private readonly ConcurrentDictionary<string, int> _terms =
        new ConcurrentDictionary<string, int>();

    // Total number of learned documents; updated atomically because the
    // term table is a concurrent collection and may be fed from several threads.
    private long _documents_count = 0;

    /// <summary>
    /// Registers one document: increments the document counter and the
    /// counter of every term returned by <c>bag.ToUniqueTokens()</c>.
    /// </summary>
    /// <param name="bag">Document to learn from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="bag"/> is null.</exception>
    public void Learn(BagOfTerms bag)
    {
        if (bag == null)
        {
            throw new ArgumentNullException(nameof(bag));
        }
        System.Threading.Interlocked.Increment(ref _documents_count);
        foreach (var term in bag.ToUniqueTokens())
        {
            _terms.AddOrUpdate(term, 1, (w, o) => o + 1);
        }
    }

    /// <summary>
    /// Computes <c>ln(1 + N / df)</c>, where N is the number of learned
    /// documents and df is the number of documents that contained the term.
    /// </summary>
    /// <param name="term">Term to look up.</param>
    /// <returns>The IDF value, or 0 when the term has never been seen.</returns>
    public double Idf(string term)
    {
        // Single lookup: avoids the ContainsKey + indexer double access.
        if (_terms.TryGetValue(term, out var count_documents_with_term))
        {
            double total_documents = (double)System.Threading.Interlocked.Read(ref _documents_count);
            return Math.Log(1.0d + (total_documents / (double)count_documents_with_term));
        }
        return 0.0d;
    }
}
/// <summary>
/// TF-IDF weighting helper.
/// </summary>
public static class TFIDF
{
    /// <summary>
    /// Builds a term -> weight map for every token returned by
    /// <c>document.ToUniqueTokensWithoutStopWords()</c>. Each weight is
    /// <c>idf.Idf(term) * frequency[term] / document.Words.Length</c>.
    /// </summary>
    /// <param name="document">Document whose terms are weighted.</param>
    /// <param name="idf">Source of inverse document frequencies.</param>
    /// <returns>Dictionary keyed by term with its TF-IDF weight.</returns>
    public static IDictionary<string, double> TfIdf(BagOfTerms document, IDF idf)
    {
        // presumably per-term occurrence counts; it is indexed by term below
        var frequencies = document.Freguency();
        var tokens = document.ToUniqueTokensWithoutStopWords();
        return tokens.ToDictionary(
            token => token,
            token => idf.Idf(token) * (double)frequencies[token] / (double)document.Words.Length);
    }
}
class Program class Program
{ {
private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json"; private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json";
@ -62,7 +23,7 @@ namespace TFIDFbee
{ {
foreach (var doc in batch) foreach (var doc in batch)
{ {
idf.Learn(doc); idf.Append(doc);
} }
} }
foreach (var batch in reader.ReadBatches(1000)) foreach (var batch in reader.ReadBatches(1000))

@ -0,0 +1,57 @@
using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using ZeroLevel.Services.Web;
namespace Tests
{
/// <summary>
/// Scratch console program: helpers for turning an HTTP response body into
/// a sorted word list and for building a configured <see cref="HttpClient"/>.
/// </summary>
class Program
{
    /// <summary>
    /// Reduces a response body to a space-separated, sorted list of its words.
    /// </summary>
    /// <remarks>
    /// The text is lower-cased, the title / meta-description / meta-keywords
    /// markers and every run of punctuation or whitespace are turned into
    /// separators, and tokens of length 0 or 1 are dropped.
    /// </remarks>
    /// <param name="response">Raw response text; null or empty yields an empty string.</param>
    /// <returns>The remaining words, sorted and joined by single spaces.</returns>
    public String responseToWords(String response)
    {
        if (string.IsNullOrEmpty(response))
        {
            return string.Empty;
        }
        response = response
            .ToLowerInvariant()
            .Replace("<title>", ",")
            .Replace("</title>", ",")
            .Replace("<meta name=\"description\" content=", ",")
            .Replace("<meta name=\"keywords\" content=", ",");
        // String.Replace only substitutes literal text, so the former call with a
        // regex pattern never stripped anything. Use Regex.Replace: every run of
        // characters that are neither word characters nor whitespace becomes a separator.
        response = System.Text.RegularExpressions.Regex.Replace(response, @"[^\w\s]+", ",");
        // Treat any whitespace (not only ' ') as a separator so that words split
        // by tabs or line breaks are not glued together.
        response = System.Text.RegularExpressions.Regex.Replace(response, @"\s+", ",");
        var array = new List<string>();
        foreach (String word in response.Split(','))
        {
            if (!string.IsNullOrWhiteSpace(word) && word.Length > 1)
            {
                array.Add(word);
            }
        }
        array.Sort();
        return string.Join(' ', array);
    }

    /// <summary>
    /// Creates an <see cref="HttpClient"/> with gzip/deflate decompression and a
    /// user-agent header taken from <c>UserAgents.Next()</c>.
    /// </summary>
    /// <returns>A new client; the caller owns and should dispose it.</returns>
    private HttpClient GetClient()
    {
        // Read the setting once. The key spelling ("Credentianls") is kept as is
        // because it has to match the existing configuration entries.
        var useDefaultCredentials = ZeroLevel.Configuration.Default.FirstOrDefault<bool>("useDefaultCredentianls");
        var handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
            UseDefaultCredentials = useDefaultCredentials,
            // SECURITY NOTE(review): this accepts every server certificate, i.e. TLS
            // validation is disabled. Acceptable only for test tooling - confirm before reuse.
            ServerCertificateCustomValidationCallback = (sender, cert, chain, sslPolicyErrors) => { return true; }
        };
        if (useDefaultCredentials)
        {
            handler.DefaultProxyCredentials = CredentialCache.DefaultCredentials;
        }
        var httpClient = new HttpClient(handler);
        httpClient.DefaultRequestHeaders.Add("user-agent", UserAgents.Next());
        return httpClient;
    }

    /// <summary>
    /// Program entry point; currently only prints a greeting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Hello World!");
    }
}
}

@ -0,0 +1,12 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Console executable project targeting .NET Core 3.1. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.1</TargetFramework>
</PropertyGroup>
<!-- References the ZeroLevel project by path relative to this project file. -->
<ItemGroup>
<ProjectReference Include="..\..\ZeroLevel\ZeroLevel.csproj" />
</ItemGroup>
</Project>

@ -1,5 +1,4 @@
using Iveonik.Stemmers; using System;
using System;
using System.Collections.Concurrent; using System.Collections.Concurrent;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
@ -15,21 +14,21 @@ namespace ZeroLevel.Services.Semantic.Helpers
private string[] _words; private string[] _words;
private ILexProvider _lexer; private ILexProvider _lexer;
public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { } public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), null) { }
public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { } public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { }
public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { } public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), null) { }
public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { } public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { }
public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { } public BagOfTerms(string[] words) : this(words, null) { }
public BagOfTerms(string[] words, ILexProvider lexer) public BagOfTerms(string[] words, ILexProvider lexer)
{ {
_lexer = lexer; _lexer = lexer;
_frequency = null; _frequency = null;
_words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray(); _words = _lexer == null ? words : _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray();
} }
public string[] Words => _words; public string[] Words => _words;

File diff suppressed because one or more lines are too long

@ -0,0 +1,90 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using ZeroLevel.Services.Semantic.Helpers;
namespace ZeroLevel.Services.Semantic
{
/// <summary>
/// Accumulates per-term document frequencies over a corpus and computes
/// an inverse document frequency for a term.
/// </summary>
public class IDF
{
    // term -> number of appended documents whose unique tokens contained the term
    private readonly ConcurrentDictionary<string, int> _terms =
        new ConcurrentDictionary<string, int>();

    // Total number of appended documents; updated atomically because the
    // term table is a concurrent collection and may be fed from several threads.
    private long _documents_count = 0;

    /// <summary>
    /// Registers one document: increments the document counter and the
    /// counter of every term returned by <c>bag.ToUniqueTokens()</c>.
    /// </summary>
    /// <param name="bag">Document to add to the statistics.</param>
    /// <exception cref="ArgumentNullException"><paramref name="bag"/> is null.</exception>
    public void Append(BagOfTerms bag)
    {
        if (bag == null)
        {
            throw new ArgumentNullException(nameof(bag));
        }
        System.Threading.Interlocked.Increment(ref _documents_count);
        foreach (var term in bag.ToUniqueTokens())
        {
            _terms.AddOrUpdate(term, 1, (w, o) => o + 1);
        }
    }

    /// <summary>
    /// Computes <c>ln(1 + N / df)</c>, where N is the number of appended
    /// documents and df is the number of documents that contained the term.
    /// </summary>
    /// <param name="term">Term to look up.</param>
    /// <returns>The IDF value, or 0 when the term has never been seen.</returns>
    public double Idf(string term)
    {
        // Single lookup: avoids the ContainsKey + indexer double access.
        if (_terms.TryGetValue(term, out var count_documents_with_term))
        {
            double total_documents = (double)System.Threading.Interlocked.Read(ref _documents_count);
            return Math.Log(1.0d + (total_documents / (double)count_documents_with_term));
        }
        return 0.0d;
    }
}
/// <summary>
/// Term-frequency and TF-IDF weighting helpers. Every method returns a
/// term -> weight map over the tokens produced by
/// <c>ToUniqueTokensWithoutStopWords()</c>, or a shared empty map when the
/// document has no words.
/// </summary>
public static class TFIDF
{
    // Shared result for documents without words.
    private static readonly IReadOnlyDictionary<string, double> _empty = new Dictionary<string, double>();

    /// <summary>
    /// Weight = <c>idf.Idf(term) * frequency[term] / document.Words.Length</c>.
    /// </summary>
    /// <param name="document">Document whose terms are weighted.</param>
    /// <param name="idf">Source of inverse document frequencies.</param>
    /// <returns>Term weights, or the empty map for a document without words.</returns>
    public static IReadOnlyDictionary<string, double> TfIdf(BagOfTerms document, IDF idf)
    {
        if (document.Words.Length <= 0)
        {
            return _empty;
        }
        var frequencies = document.Freguency();
        var tokens = document.ToUniqueTokensWithoutStopWords();
        return tokens.ToDictionary(
            token => token,
            token => idf.Idf(token) * (double)frequencies[token] / (double)document.Words.Length);
    }

    /// <summary>
    /// Weight = <c>idf.Idf(term) * (0.5 + 0.5 * frequency[term] / maxFrequency)</c>,
    /// where maxFrequency is the largest value in the document's frequency table.
    /// </summary>
    /// <param name="document">Document whose terms are weighted.</param>
    /// <param name="idf">Source of inverse document frequencies.</param>
    /// <returns>Term weights, or the empty map for a document without words.</returns>
    public static IReadOnlyDictionary<string, double> TfIdf_Smooth(BagOfTerms document, IDF idf)
    {
        if (document.Words.Length <= 0)
        {
            return _empty;
        }
        var frequencies = document.Freguency();
        var peak = (double)frequencies.Max(entry => entry.Value);
        var tokens = document.ToUniqueTokensWithoutStopWords();
        return tokens.ToDictionary(
            token => token,
            token => idf.Idf(token) * (0.5d + 0.5d * ((double)frequencies[token] / peak)));
    }

    /// <summary>
    /// Weight = <c>frequency[term] / document.Words.Length</c>.
    /// </summary>
    /// <param name="document">Document whose terms are weighted.</param>
    /// <returns>Term weights, or the empty map for a document without words.</returns>
    public static IReadOnlyDictionary<string, double> Tf(BagOfTerms document)
    {
        if (document.Words.Length <= 0)
        {
            return _empty;
        }
        var frequencies = document.Freguency();
        var tokens = document.ToUniqueTokensWithoutStopWords();
        return tokens.ToDictionary(
            token => token,
            token => (double)frequencies[token] / (double)document.Words.Length);
    }

    /// <summary>
    /// Weight = <c>0.5 + 0.5 * frequency[term] / maxFrequency</c>,
    /// where maxFrequency is the largest value in the document's frequency table.
    /// </summary>
    /// <param name="document">Document whose terms are weighted.</param>
    /// <returns>Term weights, or the empty map for a document without words.</returns>
    public static IReadOnlyDictionary<string, double> Tf_Smooth(BagOfTerms document)
    {
        if (document.Words.Length <= 0)
        {
            return _empty;
        }
        var frequencies = document.Freguency();
        var peak = (double)frequencies.Max(entry => entry.Value);
        var tokens = document.ToUniqueTokensWithoutStopWords();
        return tokens.ToDictionary(
            token => token,
            token => (0.5d + 0.5d * ((double)frequencies[token] / peak)));
    }
}
}

@ -0,0 +1,60 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Threading;
namespace ZeroLevel.Utils
{
/// <summary>
/// Fixed-size pool of dedicated background threads that process queued
/// items with a single handler.
/// </summary>
/// <typeparam name="T">Type of the queued items.</typeparam>
public class Multiprocessor<T>
    : IDisposable
{
    private readonly BlockingCollection<T> _queue = new BlockingCollection<T>();
    private readonly List<Thread> _threads = new List<Thread>();
    // 0 = live, 1 = Dispose has been entered; makes Dispose idempotent.
    private int _disposed;

    /// <summary>
    /// Creates and starts the worker threads.
    /// </summary>
    /// <param name="handler">Callback invoked for every queued item.</param>
    /// <param name="size">Number of worker threads to start.</param>
    /// <param name="stackSize">Maximum stack size, in bytes, of each worker thread.</param>
    /// <exception cref="ArgumentNullException"><paramref name="handler"/> is null.</exception>
    public Multiprocessor(Action<T> handler, int size, int stackSize = 1024 * 256)
    {
        if (handler == null)
        {
            throw new ArgumentNullException(nameof(handler));
        }
        for (int i = 0; i < size; i++)
        {
            var t = new Thread(() => Consume(handler), stackSize);
            t.IsBackground = true;
            _threads.Add(t);
        }
        foreach (var t in _threads)
        {
            t.Start();
        }
    }

    // Worker loop: takes items until adding is completed and the queue is drained.
    private void Consume(Action<T> handler)
    {
        try
        {
            foreach (var item in _queue.GetConsumingEnumerable())
            {
                try
                {
                    handler(item);
                }
                catch (Exception ex)
                {
                    // A failing item must not terminate the worker: otherwise the
                    // remaining items are never processed and waiters spin forever.
                    System.Diagnostics.Trace.TraceError("Multiprocessor handler failed: " + ex);
                }
            }
        }
        catch (ObjectDisposedException)
        {
            // The queue was disposed while this worker was still running
            // (e.g. Dispose called from inside a handler); just exit.
        }
    }

    /// <summary>
    /// Queues an item for processing by one of the workers.
    /// </summary>
    /// <param name="t">Item to process.</param>
    public void Append(T t) => _queue.Add(t);

    /// <summary>
    /// Blocks until the queue holds no pending items. Items already taken by
    /// a worker may still be inside the handler when this returns.
    /// </summary>
    public void WaitForEmpty()
    {
        while (_queue.Count > 0)
        {
            Thread.Sleep(100);
        }
    }

    /// <summary>
    /// Stops accepting items, lets the workers drain the queue and finish
    /// their current item, then releases the queue. Safe to call repeatedly.
    /// </summary>
    public void Dispose()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0)
        {
            return;
        }
        _queue.CompleteAdding();
        foreach (var t in _threads)
        {
            // Joining replaces the former fixed sleep; skip the current thread
            // so that disposing from inside a handler cannot deadlock.
            if (t != Thread.CurrentThread)
            {
                t.Join();
            }
        }
        _queue.Dispose();
    }
}
}
Loading…
Cancel
Save

Powered by TurnKey Linux.