// utilities.cpp

#include "utilities.h"
#include <qdebug.h>

/**
 * Normalizes a text in place.
 *
 * Steps, in order:
 *   1. remove every character that is neither a letter (per isalpha) nor a space;
 *   2. collapse each run of consecutive spaces into a single space;
 *   3. convert the remaining characters to lower case.
 *
 * Leading and trailing spaces are not trimmed.
 *
 * @param st text to normalize; modified in place.
 */
void prepare(std::string& st) {
    // Stop-word removal is currently disabled; the code is kept for reference.
//    std::vector<std::string> stop_words{" a ", " an ", " as ", " about ", " above ", " across ", " after ",
//                                        " against ", " and ", " are ", " be ", " before ", " along ",
//                                        " around ", " at ", " behind ", " below ", " between ", " but ",
//                                        " by ", " down ", " for ", " from ", " in ", " is ", " inside ",
//                                        " into ", " near ", " of ", " off ", " on ", " out ", " outside ",
//                                        " over ", " so ", " till ", " that ", " the ", " to ", " under ", " until ",
//                                        " up ", " with ", " while ", " this "};

//    for(std::string& s : stop_words){
//        size_t pos = 0;
//        while ((pos = st.find(s, pos)) != std::string::npos)
//            st.erase(pos, s.size()-1);
//    }

    // <cctype> functions require a value representable as unsigned char,
    // hence the cast before calling isalpha/tolower.
    const auto is_noise = [](char ch) -> bool {
        return !(isalpha(static_cast<unsigned char>(ch)) || ch == ' ');
    };

    // remove_if must be paired with erase exactly once: running it without the
    // erase leaves stale characters in the tail, and a later pass would keep
    // them (e.g. "a1b" would end up as "abb").
    st.erase(std::remove_if(st.begin(), st.end(), is_noise), st.end());

    // Collapse runs of spaces: unique drops the second of two adjacent spaces.
    st.erase(std::unique(st.begin(), st.end(),
                         [](char c1, char c2) { return c1 == ' ' && c2 == ' '; }),
             st.end());

    std::transform(st.begin(), st.end(), st.begin(), [](char ch) -> char {
        return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
    });
}

/**
 * Counts every contiguous substring of length n (character n-gram) of doc.
 *
 * @param doc text to scan.
 * @param n   n-gram length in characters.
 * @return map from n-gram to its number of (possibly overlapping) occurrences.
 *         The map is empty when n is 0 or when doc is shorter than n.
 */
std::map<std::string, int> getNgramm(const std::string& doc, size_t n) {
    std::map<std::string, int> dict;

    // Guard first: "doc.size() - n + 1" is unsigned arithmetic and would wrap
    // around to a huge loop bound when the document is shorter than n.
    if (n == 0 || n > doc.size())
        return dict;

    const size_t last_start = doc.size() - n;
    for (size_t start = 0; start <= last_start; ++start)
        ++dict[doc.substr(start, n)];
    return dict;
}

/**
 * Builds the n-gram dictionary of a document and writes it to "dictionary.txt".
 *
 * N-grams of every length from 2 up to n (inclusive) are counted with
 * getNgramm; each n-gram that occurs more than 1000 times is written to the
 * file, one per line, in lexicographic order. The file is created (or
 * truncated) even when no n-gram qualifies.
 *
 * @param doc text to analyse.
 * @param n   longest n-gram length to consider; values below 2 produce an
 *            empty dictionary file.
 */
void n_gram_calc(const std::string& doc, int n)
{
    const int min_length = 2;          // shortest n-gram length considered
    const int min_occurrences = 1000;  // an n-gram must occur more often than this

    // N-grams of different lengths can never share a key, so merging the
    // per-length maps with insert() loses no counts.
    std::map<std::string, int> dict;
    for (int length = min_length; length <= n; ++length) {
        const std::map<std::string, int> counts(getNgramm(doc, static_cast<size_t>(length)));
        dict.insert(counts.begin(), counts.end());
    }

    // The map already holds unique keys in sorted order, so it can be streamed
    // directly. (No max_element lookup here: its result was unused, and
    // dereferencing it on an empty map is undefined behaviour.)
    std::ofstream dictionary_file("dictionary.txt");
    for (const auto& entry : dict) {
        if (entry.second > min_occurrences)
            dictionary_file << entry.first << '\n';
    }
}

/**
 * Counts, for every dictionary entry, how often it occurs in a chunk of text.
 *
 * Occurrences may overlap: after each match the search resumes one character
 * past the start of that match.
 *
 * @param chunk      text to search in.
 * @param dictionary strings to look for.
 * @return one count per dictionary entry, in dictionary order.
 */
std::vector<int> freq_in_chunk(const std::string& chunk, const std::vector<std::string>& dictionary)
{
    std::vector<int> counts;
    for (const std::string& term : dictionary) {
        int hits = 0;
        for (size_t at = chunk.find(term, 0);
             at != std::string::npos;
             at = chunk.find(term, at + 1)) {
            ++hits;
        }
        counts.push_back(hits);
    }
    return counts;
}

/**
 * Combines four zv_calc values into a single non-negative score:
 *
 *   | zv(di, i, freq_i) + zv(dj, j, freq_j) - zv(di, j, freq_j) - zv(dj, i, freq_i) |
 *
 * @param T                 window size forwarded to zv_calc.
 * @param di                first reference vector.
 * @param dj                second reference vector.
 * @param i                 index paired with freq_of_ngramm_i.
 * @param j                 index paired with freq_of_ngramm_j.
 * @param freq_of_ngramm_i  table indexed relative to i by zv_calc.
 * @param freq_of_ngramm_j  table indexed relative to j by zv_calc.
 * @return absolute value of the combination above.
 */
long double dzv_calc(int T,
                     const std::vector<int> di,
                     const std::vector<int> dj,
                     int i, int j,
                     const std::vector<std::vector<int>> freq_of_ngramm_i,
                     const std::vector<std::vector<int>> freq_of_ngramm_j)
{
    // Each vector scored against the table/index it is paired with...
    const long double own_i = zv_calc(T, di, i, freq_of_ngramm_i);
    const long double own_j = zv_calc(T, dj, j, freq_of_ngramm_j);
    // ...and against the other one's table/index.
    const long double cross_i = zv_calc(T, di, j, freq_of_ngramm_j);
    const long double cross_j = zv_calc(T, dj, i, freq_of_ngramm_i);

    return std::abs(own_i + own_j - cross_i - cross_j);
}

/**
 * Averages spearman_calc(di, freq_of_ngramm_i_j[i - m]) over m = 1..T.
 *
 * @param T                   number of rows preceding index i to average over.
 *                            T == 0 yields NaN (0 / 0).
 * @param di                  vector compared against each of the T rows.
 * @param i                   index just past the last row used; rows i-1 down
 *                            to i-T are read.
 * @param freq_of_ngramm_i_j  table of rows; must contain rows i-T .. i-1.
 * @return the mean of the T spearman_calc values.
 * @throws std::out_of_range if a required row does not exist (for example
 *         when T > i, where the unsigned index i - m would wrap around).
 */
long double zv_calc(int T,
                    const std::vector<int> di,
                    int i,
                    const std::vector<std::vector<int>> freq_of_ngramm_i_j)
{
    long double total = 0.0;
    for (size_t m = 1; m <= static_cast<size_t>(T); ++m) {
        // at() rather than []: a wrapped or too-large index now raises
        // std::out_of_range instead of reading outside the table.
        total += spearman_calc(di, freq_of_ngramm_i_j.at(static_cast<size_t>(i) - m));
    }
    return total / T;
}

/**
 * Evaluates the Spearman formula  1 - 6 * sum(d_k^2) / (n * (n^2 - 1))
 * where d_k = di[k] - freq_of_ngramm_i_j[k] and n = di.size().
 *
 * The inputs are used as given; no ranking is performed here.
 *
 * @param di                  first vector; its size defines n.
 * @param freq_of_ngramm_i_j  second vector; must have at least n elements
 *                            (extra elements are ignored).
 * @return the value of the formula. For n < 2 the denominator is zero and the
 *         result is not finite (NaN or infinity).
 * @throws std::out_of_range if freq_of_ngramm_i_j is shorter than di.
 */
long double spearman_calc(const std::vector<int> di,
                          const std::vector<int> freq_of_ngramm_i_j)
{
    long double squared_diff_sum = 0.0L;
    for (size_t k = 0; k < di.size(); ++k) {
        // Subtract in floating point so large counts cannot overflow int;
        // at() turns a too-short second vector into an exception.
        const long double diff =
            static_cast<long double>(di[k]) - freq_of_ngramm_i_j.at(k);
        squared_diff_sum += diff * diff;
    }

    // Keep the denominator in floating point: n * (n^2 - 1) grows cubically
    // and does not fit an integer type for large n (it overflows a 32-bit
    // long for n in the low thousands).
    const long double n = static_cast<long double>(di.size());
    const long double denominator = n * (n * n - 1.0L);

    return 1.0L - 6.0L * squared_diff_sum / denominator;
}