1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
#include "utilities.h"
#include <cctype>
#include <qdebug.h>
// Normalizes a text in place so that it contains only lower-case letters
// separated by single spaces.
//
// Steps, in order:
//   1. remove every character that is neither a letter (per std::isalpha)
//      nor a space;
//   2. collapse each run of consecutive spaces into one space;
//   3. convert the remaining characters to lower case.
//
// Leading/trailing spaces are not trimmed (a single one may remain).
//
// st: the text to normalize; modified in place.
void prepare(std::string& st) {
    // NOTE: a stop-word filter was previously sketched here as commented-out
    // code; it was never applied and is intentionally not part of this step.

    // <cctype> classification functions are only defined for values
    // representable as unsigned char, hence the explicit conversion.
    const auto is_discarded = [](char ch) -> bool {
        const unsigned char uch = static_cast<unsigned char>(ch);
        return !(std::isalpha(uch) || ch == ' ');
    };

    // Exactly one erase-remove pass: std::remove_if leaves unspecified
    // characters after the returned iterator, so running it a second time over
    // the whole string (without erasing first) would resurrect stale letters.
    st.erase(std::remove_if(st.begin(), st.end(), is_discarded), st.end());

    // Collapse "  " -> " ".
    st.erase(std::unique(st.begin(), st.end(),
                         [](char lhs, char rhs) { return lhs == ' ' && rhs == ' '; }),
             st.end());

    std::transform(st.begin(), st.end(), st.begin(), [](char ch) -> char {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    });
}
// Counts every contiguous substring of length n (character n-gram) of doc.
// Occurrences may overlap: "abab" with n = 2 yields {"ab": 2, "ba": 1}.
//
// doc: the text to scan.
// n:   n-gram length in characters.
//
// Returns a map from n-gram to its number of occurrences. The map is empty
// when n is 0 or when n is greater than doc.size().
std::map<std::string, int> getNgramm(const std::string& doc, size_t n) {
    std::map<std::string, int> dict;

    // Guard before any arithmetic: doc.size() - n is unsigned and would wrap
    // around to a huge value for n > doc.size().
    if (n == 0 || n > doc.size())
        return dict;

    const size_t last_start = doc.size() - n;
    for (size_t start = 0; start <= last_start; ++start)
        ++dict[doc.substr(start, n)];

    return dict;
}
// Builds the n-gram dictionary of doc and writes it to "dictionary.txt" in the
// current working directory (the file is created/truncated).
//
// All character n-grams of length 2..n are counted via getNgramm(); every
// n-gram that occurs more than 1000 times is written on its own line, in
// lexicographic (std::map key) order.
//
// doc: the (already normalized) text to analyse.
// n:   largest n-gram length to consider; nothing is collected when n < 2.
void n_gram_calc(const std::string& doc, int n)
{
    const int min_length = 2;        // shortest n-gram considered
    const int min_occurrences = 1000; // an n-gram must occur MORE often than this

    std::map<std::string, int> dict;
    for (int length = min_length; length <= n; ++length) {
        // Longer n-grams than the text itself cannot exist; stop early instead
        // of asking for them.
        if (static_cast<size_t>(length) > doc.size())
            break;
        // Keys of different lengths never collide, so a plain insert merges
        // the per-length tables without losing counts.
        const std::map<std::string, int> counts = getNgramm(doc, static_cast<size_t>(length));
        dict.insert(counts.begin(), counts.end());
    }

    // Map keys are already unique and sorted, so the entries can be streamed
    // out directly. The stream is closed by its destructor.
    std::ofstream dictionary_file("dictionary.txt");
    for (const auto& entry : dict) {
        if (entry.second > min_occurrences)
            dictionary_file << entry.first << '\n';
    }
}
// Counts, for each dictionary entry, how many times it occurs in chunk.
// Occurrences may overlap, because the search resumes one character after the
// start of the previous match.
//
// chunk:      the text to search in.
// dictionary: the strings to look for.
//
// Returns one count per dictionary entry, in dictionary order.
std::vector<int> freq_in_chunk(const std::string& chunk, const std::vector<std::string>& dictionary)
{
    std::vector<int> counts;
    for (const std::string& term : dictionary) {
        int hits = 0;
        for (size_t at = chunk.find(term, 0);
             at != std::string::npos;
             at = chunk.find(term, at + 1)) {
            ++hits;
        }
        counts.push_back(hits);
    }
    return counts;
}
// Combines four zv_calc() values into a single non-negative number:
//
//   | zv(di, i, freq_i) + zv(dj, j, freq_j)
//     - zv(di, j, freq_j) - zv(dj, i, freq_i) |
//
// T:                 forwarded unchanged to every zv_calc() call.
// di, dj:            the two vectors being compared.
// i, j:              positions forwarded to zv_calc() together with the
//                    matching frequency table.
// freq_of_ngramm_i:  table used with position i.
// freq_of_ngramm_j:  table used with position j.
//
// NOTE(review): all vectors are taken by value, so each call copies them;
// the signature is kept as-is because it is declared outside this file.
long double dzv_calc(int T,
                     const std::vector<int> di,
                     const std::vector<int> dj,
                     int i, int j,
                     const std::vector<std::vector<int>> freq_of_ngramm_i,
                     const std::vector<std::vector<int>> freq_of_ngramm_j)
{
    // Each vector against the table/position of its own side ...
    const long double own_i = zv_calc(T, di, i, freq_of_ngramm_i);
    const long double own_j = zv_calc(T, dj, j, freq_of_ngramm_j);
    // ... and against the table/position of the other side.
    const long double cross_i = zv_calc(T, di, j, freq_of_ngramm_j);
    const long double cross_j = zv_calc(T, dj, i, freq_of_ngramm_i);

    return std::abs(own_i + own_j - cross_i - cross_j);
}
// Averages spearman_calc(di, row) over the T rows of freq_of_ngramm_i_j that
// immediately precede index i, i.e. rows i-1, i-2, ..., i-T.
//
// T:                   number of preceding rows to average over; also the
//                      divisor of the result.
// di:                  vector compared against each row.
// i:                   index whose predecessors are used.
// freq_of_ngramm_i_j:  table of rows to compare against.
//
// No bounds checking is performed: the caller presumably guarantees
// 1 <= T <= i and i - 1 < freq_of_ngramm_i_j.size() -- TODO confirm.
long double zv_calc(int T,
                    const std::vector<int> di,
                    int i,
                    const std::vector<std::vector<int>> freq_of_ngramm_i_j)
{
    const size_t anchor = (size_t)i;
    const size_t window = (size_t)T;

    long double sum = 0.0;
    size_t back = 1;
    while (back <= window) {
        sum += spearman_calc(di, freq_of_ngramm_i_j[anchor - back]);
        ++back;
    }
    return sum / T;
}
// Evaluates the Spearman-style formula
//
//     1 - 6 * sum((di[k] - freq[k])^2) / (n * (n^2 - 1)),   n = di.size()
//
// on the two vectors exactly as given (no ranking is performed here).
//
// di:                  first vector; its size defines n.
// freq_of_ngramm_i_j:  second vector; must hold at least di.size() elements
//                      (not checked).
//
// Returns the value of the formula, or 0 when di has fewer than two elements,
// where the denominator would be zero and the coefficient is undefined.
long double spearman_calc(const std::vector<int> di,
                          const std::vector<int> freq_of_ngramm_i_j)
{
    const size_t count = di.size();
    if (count < 2)
        return 0.0L;

    long double squared_diff_sum = 0.0L;
    for (size_t k = 0; k < count; ++k) {
        // Subtract in floating point so large counts cannot overflow int.
        const long double diff =
            static_cast<long double>(di[k]) - static_cast<long double>(freq_of_ngramm_i_j[k]);
        squared_diff_sum += diff * diff;
    }

    // n * (n^2 - 1) is kept in long double: it exceeds 32-bit integer range
    // already for n around 1300.
    const long double n = static_cast<long double>(count);
    const long double denominator = n * (n * n - 1.0L);

    return 1.0L - (6.0L * squared_diff_sum) / denominator;
}
|