#include "utilities.h"

// NOTE(review): the second #include directive of this file carried no header
// name; the standard headers this translation unit uses are listed explicitly.
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <map>
#include <set>
#include <string>
#include <vector>

// NOTE(review): the template arguments used below (std::map<std::string, int>,
// std::vector<int>, std::vector<std::string>, std::vector<std::vector<int>>)
// are inferred from how the values are used in this file; confirm that they
// match the declarations in utilities.h.

// Normalises `st` in place:
//   1. removes every character that is neither alphabetic nor a space,
//   2. collapses each run of consecutive spaces into a single space,
//   3. converts the remaining characters to lower case.
void prepare(std::string& st)
{
    // Disabled stop-word removal, kept for reference:
    // std::vector<std::string> stop_words{" a ", " an ", " as ", " about ", " above ", " across ", " after ",
    //                                     " against ", " and ", " are ", " be ", " before ", " along ",
    //                                     " around ", " at ", " behind ", " below ", " between ", " but ",
    //                                     " by ", " down ", " for ", " from ", " in ", " is ", " inside ",
    //                                     " into ", " near ", " of ", " off ", " on ", " out ", " outside ",
    //                                     " over ", " so ", " till ", " that ", " the ", " to ", " under ", " until ",
    //                                     " up ", " with ", " while ", " this "};
    // for (std::string& s : stop_words) {
    //     size_t pos = 0;
    //     while ((pos = st.find(s, pos)) != std::string::npos)
    //         st.erase(pos, s.size() - 1);
    // }

    const auto is_noise = [](char ch) -> bool {
        return !(std::isalpha(static_cast<unsigned char>(ch)) || ch == ' ');
    };
    // std::remove_if only compacts the kept characters to the front; it must be
    // paired with erase exactly once. Running it a second time over the whole
    // string would re-admit the stale characters left behind in the tail.
    st.erase(std::remove_if(st.begin(), st.end(), is_noise), st.end());

    const auto both_spaces = [](char lhs, char rhs) -> bool {
        return lhs == ' ' && rhs == ' ';
    };
    st.erase(std::unique(st.begin(), st.end(), both_spaces), st.end());

    // The <cctype> functions require a value representable as unsigned char.
    std::transform(st.begin(), st.end(), st.begin(), [](char ch) -> char {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    });
}

// Counts every substring of length `n` of `doc` (overlapping windows).
// Returns a map from substring to its number of occurrences. The map is empty
// when `doc` is shorter than `n`.
std::map<std::string, int> getNgramm(const std::string& doc, size_t n)
{
    std::map<std::string, int> dict;
    // `i + n <= size` instead of `i < size - n + 1`: the latter wraps around
    // for unsigned operands when the document is shorter than n.
    for (size_t i = 0; i + n <= doc.size(); ++i)
        ++dict[doc.substr(i, n)];
    return dict;
}

// Collects the n-grams of `doc` for every length from 2 up to `n` and writes
// those that occur more than 1000 times to "dictionary.txt", one per line, in
// the sorted order of std::set. The file is created (or truncated) even when
// no n-gram qualifies.
void n_gram_calc(const std::string& doc, int n)
{
    const int min_length = 2;
    const int min_count = 1000;

    std::set<std::string> unique;
    for (int length = min_length; length <= n; ++length) {
        // n-grams of different lengths can never share a key, so each length
        // is filtered on its own instead of being merged into one map first.
        const std::map<std::string, int> counts = getNgramm(doc, static_cast<size_t>(length));
        for (const std::pair<const std::string, int>& entry : counts) {
            if (entry.second > min_count)
                unique.insert(entry.first);
        }
    }

    std::ofstream dictionary_file("dictionary.txt");
    for (const std::string& str : unique)
        dictionary_file << str << '\n';
    // The stream is closed by its destructor.
}

// For each entry of `dictionary`, counts how many times it occurs in `chunk`
// (overlapping occurrences are counted). The result has one element per
// dictionary entry, in dictionary order.
std::vector<int> freq_in_chunk(const std::string& chunk, const std::vector<std::string>& dictionary)
{
    std::vector<int> freq_in_chunks;
    freq_in_chunks.reserve(dictionary.size());
    for (const std::string& ngram : dictionary) {
        int quantity = 0;
        // Restart the search one character after each hit.
        for (size_t pos = chunk.find(ngram); pos != std::string::npos; pos = chunk.find(ngram, pos + 1))
            ++quantity;
        freq_in_chunks.push_back(quantity);
    }
    return freq_in_chunks;
}

// Returns 1 - 6 * sum((di[k] - freq_of_ngramm_i_j[k])^2) / (n * (n^2 - 1)),
// where n = di.size() and k runs over the indices of `di`.
//
// Throws std::out_of_range (from vector::at) if `freq_of_ngramm_i_j` is shorter
// than `di`. For n < 2 the denominator is zero, so the result is not finite.
//
// NOTE(review): the vectors are taken by value to match the existing
// signature; switching to const references requires changing the declaration
// in utilities.h as well.
long double spearman_calc(const std::vector<int> di, const std::vector<int> freq_of_ngramm_i_j)
{
    long double total = 0.0L;
    for (size_t k = 0; k < di.size(); ++k) {
        const long double diff = static_cast<long double>(di[k])
                                 - static_cast<long double>(freq_of_ngramm_i_j.at(k));
        total += diff * diff;
    }
    total *= 6;

    // Kept in floating point: n * (n^2 - 1) overflows a 32-bit long quickly.
    const long double n = static_cast<long double>(di.size());
    total /= n * (n * n - 1);
    return 1 - total;
}

// Returns the mean of spearman_calc(di, freq_of_ngramm_i_j[i - m]) over
// m = 1..T.
//
// Throws std::out_of_range (from vector::at) when some i - m does not index
// an element of `freq_of_ngramm_i_j`. For T == 0 the division yields NaN.
long double zv_calc(int T, const std::vector<int> di, int i, const std::vector<std::vector<int>> freq_of_ngramm_i_j)
{
    long double total = 0.0L;
    for (int m = 1; m <= T; ++m) {
        // If i < m the unsigned subtraction wraps to a huge index, which at()
        // rejects instead of reading out of bounds.
        const size_t row = static_cast<size_t>(i) - static_cast<size_t>(m);
        total += spearman_calc(di, freq_of_ngramm_i_j.at(row));
    }
    return total / T;
}

// Returns | zv_calc(T, di, i, freq_of_ngramm_i) + zv_calc(T, dj, j, freq_of_ngramm_j)
//         - zv_calc(T, di, j, freq_of_ngramm_j) - zv_calc(T, dj, i, freq_of_ngramm_i) |.
long double dzv_calc(int T, const std::vector<int> di, const std::vector<int> dj, int i, int j,
                     const std::vector<std::vector<int>> freq_of_ngramm_i,
                     const std::vector<std::vector<int>> freq_of_ngramm_j)
{
    return std::abs(zv_calc(T, di, i, freq_of_ngramm_i)
                    + zv_calc(T, dj, j, freq_of_ngramm_j)
                    - zv_calc(T, di, j, freq_of_ngramm_j)
                    - zv_calc(T, dj, i, freq_of_ngramm_i));
}