// utilities.cpp — text normalisation, character n-gram dictionary extraction,
// and Spearman / zv / dzv similarity metrics used for text-chunk comparison.
//
// NOTE(review): this file was recovered from an HTML-mangled cgit dump: every
// template argument list (<...>) and the #include targets had been stripped.
// The types below are reconstructed from usage — verify them against
// utilities.h, and re-add `#include "utilities.h"` at the top (definitions
// are ordered bottom-up so the file also compiles without it).
//
// NOTE(review): the original carried a commented-out stop-word removal pass
// (a/an/as/about/...) at the top of prepare(); it was dead code and is not
// reproduced here — restore from VCS history if needed.

#include <algorithm>
#include <cctype>
#include <cmath>
#include <fstream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Normalise `st` in place: strip every character that is not a letter or a
// space, collapse runs of spaces to a single space, and lower-case the
// result. Leading/trailing spaces may remain, matching the original contract.
void prepare(std::string& st) {
    // BUGFIX: the original invoked std::remove_if twice — the first call's
    // result was discarded, leaving the tail of the string in an unspecified
    // state that the second pass then re-filtered, which could duplicate
    // characters (e.g. "a1b" -> "abb"). One erase-remove pass is correct.
    st.erase(std::remove_if(st.begin(), st.end(),
                            [](unsigned char ch) {
                                return !(std::isalpha(ch) || ch == ' ');
                            }),
             st.end());

    // Collapse consecutive spaces ("a  b" -> "a b").
    st.erase(std::unique(st.begin(), st.end(),
                         [](char c1, char c2) { return c1 == ' ' && c2 == ' '; }),
             st.end());

    // BUGFIX: passing ::tolower directly is UB for negative char values;
    // route through unsigned char.
    std::transform(st.begin(), st.end(), st.begin(), [](unsigned char ch) {
        return static_cast<char>(std::tolower(ch));
    });
}

// Count every length-n substring (character n-gram) of `doc`.
// Returns a map of n-gram -> occurrence count; empty when doc is shorter
// than n or n is 0.
std::map<std::string, int> getNgramm(const std::string& doc, size_t n) {
    std::map<std::string, int> dict;
    // BUGFIX: the original loop bound `doc.size() - n + 1` underflows
    // (size_t arithmetic) when the document is shorter than n.
    if (n == 0 || doc.size() < n)
        return dict;
    for (size_t i = 0; i + n <= doc.size(); ++i)
        ++dict[doc.substr(i, n)];
    return dict;
}

// Build a dictionary of frequent n-grams (lengths 2..n) of `doc` and write
// them, one per line in sorted order, to "dictionary.txt" in the CWD.
void n_gram_calc(const std::string& doc, int n)
{
    const int min = 2;  // shortest n-gram length considered
    std::map<std::string, int> dict;
    for (int len = min; len <= n; ++len) {
        std::map<std::string, int> temp = getNgramm(doc, static_cast<size_t>(len));
        dict.insert(temp.begin(), temp.end());  // keys of different lengths never collide
    }

    // NOTE(review): the original also ran std::max_element over `dict`
    // (UB when the map is empty) but never used the result; the selection
    // threshold is the fixed count below. The dead code is removed —
    // presumably the threshold was meant to depend on the maximum; confirm
    // with the author.
    std::set<std::string> unique;
    for (const auto& entry : dict)
        if (entry.second > 1000)  // keep only very frequent n-grams
            unique.insert(entry.first);

    std::ofstream dictionary_file("dictionary.txt");
    for (const std::string& str : unique)
        dictionary_file << str << '\n';
    // ofstream closes via RAII; explicit close() not needed.
}

// For each dictionary n-gram, count its (possibly overlapping) occurrences
// in `chunk`. The result order matches `dictionary` order.
std::vector<int> freq_in_chunk(const std::string& chunk,
                               const std::vector<std::string>& dictionary)
{
    std::vector<int> freq_in_chunks;
    freq_in_chunks.reserve(dictionary.size());
    for (const std::string& ngram : dictionary) {
        int quantity = 0;
        size_t pos = 0;
        while ((pos = chunk.find(ngram, pos)) != std::string::npos) {
            ++pos;       // advance one char so overlapping matches count
            ++quantity;
        }
        freq_in_chunks.push_back(quantity);
    }
    return freq_in_chunks;
}

// Spearman-style coefficient between two equal-length frequency vectors:
//   1 - 6 * sum((di[k] - dj[k])^2) / (n * (n^2 - 1))
// Assumes freq_of_ngramm_i_j has at least di.size() elements.
// Returns 1.0 for fewer than two elements (the denominator would be zero).
// NOTE(review): parameters are by value as in the original (and presumably
// utilities.h); const& would avoid copies but changes the declared type.
long double spearman_calc(const std::vector<int> di,
                          const std::vector<int> freq_of_ngramm_i_j)
{
    const size_t n = di.size();
    if (n < 2)
        return 1.0L;  // BUGFIX: original divided by n*(n^2-1) == 0 here

    long double total = 0.0L;
    for (size_t k = 0; k < n; ++k) {
        const long double diff =
            static_cast<long double>(di[k]) - freq_of_ngramm_i_j[k];
        total += diff * diff;  // diff*diff, not pow(diff, 2): exact and faster
    }
    // BUGFIX: the original computed the denominator with pow() and truncated
    // it into a long int (overflow for large n); keep it in long double.
    const long double m = static_cast<long double>(n) *
                          (static_cast<long double>(n) * n - 1.0L);
    return 1.0L - 6.0L * total / m;
}

// Average Spearman coefficient between chunk i's frequency vector `di` and
// the frequency vectors of the T chunks preceding chunk i.
// Assumes i >= T and freq_of_ngramm_i_j holds at least i rows — TODO confirm
// with callers; out-of-range i/T indexes past the vector as in the original.
long double zv_calc(int T,
                    const std::vector<int> di,
                    int i,
                    const std::vector<std::vector<int>> freq_of_ngramm_i_j)
{
    if (T <= 0)
        return 0.0L;  // BUGFIX: original returned 0/0 (NaN) when T == 0

    long double total = 0.0L;
    for (size_t m = 1; m <= static_cast<size_t>(T); ++m)
        total += spearman_calc(di, freq_of_ngramm_i_j[static_cast<size_t>(i) - m]);
    return total / T;
}

// "Delta zv" dissimilarity between chunks i and j:
//   |zv(i, window_i) + zv(j, window_j) - zv(i, window_j) - zv(j, window_i)|
long double dzv_calc(int T,
                     const std::vector<int> di,
                     const std::vector<int> dj,
                     int i, int j,
                     const std::vector<std::vector<int>> freq_of_ngramm_i,
                     const std::vector<std::vector<int>> freq_of_ngramm_j)
{
    return std::abs(zv_calc(T, di, i, freq_of_ngramm_i) +
                    zv_calc(T, dj, j, freq_of_ngramm_j) -
                    zv_calc(T, di, j, freq_of_ngramm_j) -
                    zv_calc(T, dj, i, freq_of_ngramm_i));
}