TextClassifierLib/source/BaseVectorizer.cpp at cpp11 · rvcgeeks/TextClassifierLib

History

179 lines (150 loc) · 3.73 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

/*++

Revision History:

Date: Jun 28, 2024.

Author: Rajas Chavadekar.

Desc: Created.

--*/

#include <algorithm>
#include <cctype>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "BaseVectorizer.h"

/**
 * @brief Normalize raw text before it is tokenized.
 *
 * Every newline becomes a space. Every other character that is alphanumeric,
 * whitespace or punctuation (as classified by the current C locale) is kept,
 * converted to lower case. Any remaining character is replaced by a single
 * space, so the result always has the same length as the input.
 *
 * @param text The text to normalize.
 * @return The normalized copy of @p text.
 */
std::string preprocess_text(const std::string& text) {
    std::string filtered;
    filtered.reserve(text.size());

    for (const char c : text) {
        // The <cctype> classifiers have undefined behaviour for negative
        // values other than EOF, so the byte is classified as unsigned char.
        const unsigned char uc = static_cast<unsigned char>(c);

        const bool keep = c != '\n' &&
                          (std::isalnum(uc) || std::isspace(uc) || std::ispunct(uc));

        if (keep) {
            filtered += static_cast<char>(std::tolower(uc));
        } else {
            filtered += ' ';
        }
    }

    return filtered;
}

/**
 * @brief Generate n-grams from a vector of words.
 *
 * Each n-gram is @p n consecutive tokens joined by a single space, in the
 * order in which they appear in @p tokens.
 *
 * @param tokens The vector of words (tokens) from which to generate n-grams.
 * @param n The size of n-grams to generate.
 * @return A vector of n-grams as strings; empty when @p n is not positive or
 *         when there are fewer than @p n tokens.
 */
std::vector<std::string> generateNGrams(const std::vector<std::string>& tokens, int n) {
    std::vector<std::string> ngrams;

    if (n <= 0) {
        return ngrams; // Invalid n-gram size.
    }

    // n is known to be positive here, so the conversion is lossless and all
    // further arithmetic stays unsigned.
    const std::size_t width = static_cast<std::size_t>(n);
    if (tokens.size() < width) {
        return ngrams; // Not enough tokens for a single n-gram.
    }

    const std::size_t total = tokens.size() - width + 1;
    ngrams.reserve(total);

    for (std::size_t start = 0; start < total; ++start) {
        std::string ngram = tokens[start];
        for (std::size_t offset = 1; offset < width; ++offset) {
            ngram += ' ';
            ngram += tokens[start + offset];
        }
        ngrams.push_back(std::move(ngram));
    }

    return ngrams;
}

/**

* @brief Split a sentence into a vector of words.

* @param sentence_ The sentence to split.

* @return Vector of words.

vector<string> BaseVectorizer::buildSentenceVector(string sentence_, bool preprocess)

{

GlobalData vars;

string new_word = "";

vector<string> ret;

if (true == preprocess)

{

sentence_ = preprocess_text(sentence_);

}

for (char x : sentence_)

{

if (isupper(x) && !case_sensitive)

{

x = tolower(x);

}

if (x == ' ')

{

if (!include_stopwords && vars.stopWords.count(new_word))

{

new_word = "";

}

else

{

ret.push_back(new_word);

new_word = "";

}

else if (vars.punctuation.count(x))

{

ret.push_back(new_word);

new_word = x;

ret.push_back(new_word);

new_word = "";

}

else

{

new_word += x;

}

if (new_word != "")

{

ret.push_back(new_word);

}

vector<string> fixed_ret;

for (const auto& s : ret)

{

if (!s.empty())

{

fixed_ret.push_back(s);

}

if (ngrams > 1) {

return generateNGrams(fixed_ret, ngrams);

}

return fixed_ret;

}

void BaseVectorizer::scanForSparseHistogram(std::string abs_filepath_to_features, int minfrequency)

{

ifstream in;

string feature;

vector<string> features;

in.open(abs_filepath_to_features);

if (!in)

{

cout << "ERROR: Cannot open features file.\n";

return;

}

std::unordered_map<std::string, int> histogram2;

while (getline(in, feature))

{

features = buildSentenceVector(feature);

for (const auto& x : features)

{

if (histogram2.count(x) || x.length() == 1)

{

histogram2[x]++;

}

else

{

histogram2[x] = 1;

}

in.close();

for (const auto& entry : histogram2)

{

if (entry.second < minfrequency)

{

histogram[entry.first] = entry.second;

}

std::cout << "No of Rare Words = " << histogram.size() << std::endl;

}

/**
 * @brief Store a version string in `vers_info`.
 *
 * The buffer is zeroed first and at most `sizeof(vers_info) - 1` characters
 * are copied, so the stored string is always NUL-terminated and an over-long
 * input is truncated instead of overflowing the buffer. A null input leaves
 * the buffer empty.
 *
 * NOTE(review): assumes `vers_info` is a fixed-size char array member, as the
 * sizeof-based memset implies - confirm against the class declaration.
 *
 * @param vers_info_in NUL-terminated version string to store; may be null.
 */
void BaseVectorizer::setVersionInfo(char* vers_info_in)
{
    std::memset(vers_info, 0, sizeof(vers_info));

    if (vers_info_in == nullptr)
    {
        return;
    }

    // The last byte is left untouched (zero) to guarantee termination.
    std::strncpy(vers_info, vers_info_in, sizeof(vers_info) - 1);
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

BaseVectorizer.cpp

Latest commit

History

BaseVectorizer.cpp

File metadata and controls