// utilities.cpp — text normalisation, character n-gram dictionary extraction,
// and Spearman / zv / dzv similarity metrics used for text-chunk comparison.
//
// NOTE(review): this file was recovered from an HTML-mangled cgit dump: every
// template argument list (<...>) and the #include targets had been stripped.
// The types below are reconstructed from usage — verify them against
// utilities.h, and re-add `#include "utilities.h"` at the top (definitions
// are ordered bottom-up so the file also compiles without it).
//
// NOTE(review): the original carried a commented-out stop-word removal pass
// (a/an/as/about/...) at the top of prepare(); it was dead code and is not
// reproduced here — restore from VCS history if needed.

#include <algorithm>
#include <cctype>
#include <cmath>
#include <fstream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Normalise `st` in place: strip every character that is not a letter or a
// space, collapse runs of spaces to a single space, and lower-case the
// result. Leading/trailing spaces may remain, matching the original contract.
void prepare(std::string& st) {
    // BUGFIX: the original invoked std::remove_if twice — the first call's
    // result was discarded, leaving the tail of the string in an unspecified
    // state that the second pass then re-filtered, which could duplicate
    // characters (e.g. "a1b" -> "abb"). One erase-remove pass is correct.
    st.erase(std::remove_if(st.begin(), st.end(),
                            [](unsigned char ch) {
                                return !(std::isalpha(ch) || ch == ' ');
                            }),
             st.end());

    // Collapse consecutive spaces ("a  b" -> "a b").
    st.erase(std::unique(st.begin(), st.end(),
                         [](char c1, char c2) { return c1 == ' ' && c2 == ' '; }),
             st.end());

    // BUGFIX: passing ::tolower directly is UB for negative char values;
    // route through unsigned char.
    std::transform(st.begin(), st.end(), st.begin(), [](unsigned char ch) {
        return static_cast<char>(std::tolower(ch));
    });
}

// Count every length-n substring (character n-gram) of `doc`.
// Returns a map of n-gram -> occurrence count; empty when doc is shorter
// than n or n is 0.
std::map<std::string, int> getNgramm(const std::string& doc, size_t n) {
    std::map<std::string, int> dict;
    // BUGFIX: the original loop bound `doc.size() - n + 1` underflows
    // (size_t arithmetic) when the document is shorter than n.
    if (n == 0 || doc.size() < n)
        return dict;
    for (size_t i = 0; i + n <= doc.size(); ++i)
        ++dict[doc.substr(i, n)];
    return dict;
}

// Build a dictionary of frequent n-grams (lengths 2..n) of `doc` and write
// them, one per line in sorted order, to "dictionary.txt" in the CWD.
void n_gram_calc(const std::string& doc, int n)
{
    const int min = 2;  // shortest n-gram length considered
    std::map<std::string, int> dict;
    for (int len = min; len <= n; ++len) {
        std::map<std::string, int> temp = getNgramm(doc, static_cast<size_t>(len));
        dict.insert(temp.begin(), temp.end());  // keys of different lengths never collide
    }

    // NOTE(review): the original also ran std::max_element over `dict`
    // (UB when the map is empty) but never used the result; the selection
    // threshold is the fixed count below. The dead code is removed —
    // presumably the threshold was meant to depend on the maximum; confirm
    // with the author.
    std::set<std::string> unique;
    for (const auto& entry : dict)
        if (entry.second > 1000)  // keep only very frequent n-grams
            unique.insert(entry.first);

    std::ofstream dictionary_file("dictionary.txt");
    for (const std::string& str : unique)
        dictionary_file << str << '\n';
    // ofstream closes via RAII; explicit close() not needed.
}

// For each dictionary n-gram, count its (possibly overlapping) occurrences
// in `chunk`. The result order matches `dictionary` order.
std::vector<int> freq_in_chunk(const std::string& chunk,
                               const std::vector<std::string>& dictionary)
{
    std::vector<int> freq_in_chunks;
    freq_in_chunks.reserve(dictionary.size());
    for (const std::string& ngram : dictionary) {
        int quantity = 0;
        size_t pos = 0;
        while ((pos = chunk.find(ngram, pos)) != std::string::npos) {
            ++pos;       // advance one char so overlapping matches count
            ++quantity;
        }
        freq_in_chunks.push_back(quantity);
    }
    return freq_in_chunks;
}

// Spearman-style coefficient between two equal-length frequency vectors:
//   1 - 6 * sum((di[k] - dj[k])^2) / (n * (n^2 - 1))
// Assumes freq_of_ngramm_i_j has at least di.size() elements.
// Returns 1.0 for fewer than two elements (the denominator would be zero).
// NOTE(review): parameters are by value as in the original (and presumably
// utilities.h); const& would avoid copies but changes the declared type.
long double spearman_calc(const std::vector<int> di,
                          const std::vector<int> freq_of_ngramm_i_j)
{
    const size_t n = di.size();
    if (n < 2)
        return 1.0L;  // BUGFIX: original divided by n*(n^2-1) == 0 here

    long double total = 0.0L;
    for (size_t k = 0; k < n; ++k) {
        const long double diff =
            static_cast<long double>(di[k]) - freq_of_ngramm_i_j[k];
        total += diff * diff;  // diff*diff, not pow(diff, 2): exact and faster
    }
    // BUGFIX: the original computed the denominator with pow() and truncated
    // it into a long int (overflow for large n); keep it in long double.
    const long double m = static_cast<long double>(n) *
                          (static_cast<long double>(n) * n - 1.0L);
    return 1.0L - 6.0L * total / m;
}

// Average Spearman coefficient between chunk i's frequency vector `di` and
// the frequency vectors of the T chunks preceding chunk i.
// Assumes i >= T and freq_of_ngramm_i_j holds at least i rows — TODO confirm
// with callers; out-of-range i/T indexes past the vector as in the original.
long double zv_calc(int T,
                    const std::vector<int> di,
                    int i,
                    const std::vector<std::vector<int>> freq_of_ngramm_i_j)
{
    if (T <= 0)
        return 0.0L;  // BUGFIX: original returned 0/0 (NaN) when T == 0

    long double total = 0.0L;
    for (size_t m = 1; m <= static_cast<size_t>(T); ++m)
        total += spearman_calc(di, freq_of_ngramm_i_j[static_cast<size_t>(i) - m]);
    return total / T;
}

// "Delta zv" dissimilarity between chunks i and j:
//   |zv(i, window_i) + zv(j, window_j) - zv(i, window_j) - zv(j, window_i)|
long double dzv_calc(int T,
                     const std::vector<int> di,
                     const std::vector<int> dj,
                     int i, int j,
                     const std::vector<std::vector<int>> freq_of_ngramm_i,
                     const std::vector<std::vector<int>> freq_of_ngramm_j)
{
    return std::abs(zv_calc(T, di, i, freq_of_ngramm_i) +
                    zv_calc(T, dj, j, freq_of_ngramm_j) -
                    zv_calc(T, di, j, freq_of_ngramm_j) -
                    zv_calc(T, dj, i, freq_of_ngramm_i));
}