|
|
@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
using System;
|
|
|
|
|
|
|
|
using System.Collections.Concurrent;
|
|
|
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
|
|
|
using System.Linq;
|
|
|
|
|
|
|
|
using ZeroLevel.Services.Semantic.Helpers;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace ZeroLevel.Services.Semantic
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
public class IDF
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
private ConcurrentDictionary<string, int> _terms =
|
|
|
|
|
|
|
|
new ConcurrentDictionary<string, int>();
|
|
|
|
|
|
|
|
private long _documents_count = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public void Append(BagOfTerms bag)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
_documents_count++;
|
|
|
|
|
|
|
|
foreach (var term in bag.ToUniqueTokens())
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
_terms.AddOrUpdate(term, 1, (w, o) => o + 1);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public double Idf(string term)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (_terms.ContainsKey(term))
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
double count_documents_with_term = (double)_terms[term];
|
|
|
|
|
|
|
|
double total_documents = (double)_documents_count;
|
|
|
|
|
|
|
|
return Math.Log(1.0d + (total_documents / count_documents_with_term));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0.0d;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static class TFIDF
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
private static readonly IReadOnlyDictionary<string, double> _empty = new Dictionary<string, double>();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyDictionary<string, double> TfIdf(BagOfTerms document, IDF idf)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (document.Words.Length > 0)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
var freg = document.Freguency();
|
|
|
|
|
|
|
|
return document
|
|
|
|
|
|
|
|
.ToUniqueTokensWithoutStopWords()
|
|
|
|
|
|
|
|
.ToDictionary(t => t, t => idf.Idf(t) * (double)freg[t] / (double)document.Words.Length);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return _empty;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyDictionary<string, double> TfIdf_Smooth(BagOfTerms document, IDF idf)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (document.Words.Length > 0)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
var freg = document.Freguency();
|
|
|
|
|
|
|
|
var max = (double)freg.Max(f => f.Value);
|
|
|
|
|
|
|
|
return document
|
|
|
|
|
|
|
|
.ToUniqueTokensWithoutStopWords()
|
|
|
|
|
|
|
|
.ToDictionary(t => t, t => idf.Idf(t) * (0.5d + 0.5d * ((double)freg[t] / max)));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return _empty;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyDictionary<string, double> Tf(BagOfTerms document)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (document.Words.Length > 0)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
var freg = document.Freguency();
|
|
|
|
|
|
|
|
return document
|
|
|
|
|
|
|
|
.ToUniqueTokensWithoutStopWords()
|
|
|
|
|
|
|
|
.ToDictionary(t => t, t => (double)freg[t] / (double)document.Words.Length);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return _empty;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static IReadOnlyDictionary<string, double> Tf_Smooth(BagOfTerms document)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
if (document.Words.Length > 0)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
var freg = document.Freguency();
|
|
|
|
|
|
|
|
var max = (double)freg.Max(f => f.Value);
|
|
|
|
|
|
|
|
return document
|
|
|
|
|
|
|
|
.ToUniqueTokensWithoutStopWords()
|
|
|
|
|
|
|
|
.ToDictionary(t => t, t => (0.5d + 0.5d * ((double)freg[t] / max)));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return _empty;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|