C++ Outlier Detection
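Word2Vec (skip-gram with hierarchical softmax) is trained on the lines in sentences.txt, each sentence is reduced to the running average of its word vectors, and DBSCAN is applied to those sentence vectors in two passes: the first pass builds a model from a baseline window and flags sentences that predict() cannot assign to any cluster, the second pass re-clusters the flagged vectors and reports the points labelled as noise. Three files follow: ai.hpp, main.cpp and sentences.txt.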
ai.hpp
#pragma once
#include <vector>
#include <cmath>
#include <string>
#include <list>
#include <unordered_map>
#include <tuple>
#include <algorithm>
#include <memory>
#include <fstream>
#include <sstream>
#include <chrono>
#include <random>   // std::default_random_engine, std::uniform_real_distribution
#include <numeric>  // std::accumulate
#include <cstdint>  // int32_t, uint32_t
#include <cstdlib>  // rand
#include <ctime>    // time
using namespace std;
typedef std::vector<float> Vector;
template <typename T> struct Cvt;
template <> struct Cvt<std::string> {
static const std::string& to_utf8(const std::string& s) { return s; }
static const std::string& from_utf8(const std::string& s) { return s; }
};
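// Minimal dense-vector helpers (dot product, saxpy, normalization) used by Word2Vec below.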
namespace v {
struct LightVector: public std::pair<float *, float *>
{
using Parent = std::pair<float *, float *>;
template <typename... Arg> LightVector(Arg&& ... arg): Parent(std::forward<Arg>(arg) ...) {}
float *data() const { return first; }
size_t size() const { return std::distance(first, second); }
bool empty() const { return first == second; }
float& operator[](size_t i) { return *(first + i); }
float operator[](size_t i) const { return *(first + i); }
};
template <class Vector1, class Vector2> inline float dot(const Vector1&x, const Vector2& y) {
int m = x.size(); const float *xd = x.data(), *yd = y.data();
float sum = 0.0;
while (--m >= 0) sum += (*xd++) * (*yd++);
return sum;
}
// saxpy: x += g * y
inline void saxpy(Vector& x, float g, const Vector& y) {
int m = x.size(); float *xd = x.data(); const float *yd = y.data();
while (--m >= 0) (*xd++) += g * (*yd++);
}
inline void unit(Vector& x) {
float len = std::sqrt(dot(x, x));
if (len == 0) return;
int m = x.size(); float *xd = x.data();
while (--m >= 0) (*xd++) /= len;
}
}
namespace ai
{
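// Skip-gram Word2Vec trained with hierarchical softmax over a Huffman tree; optional bigram phrase merging.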
template <class String = std::string>
struct Word2Vec
{
struct Word
{
int32_t index_;
String text_;
uint32_t count_;
Word *left_, *right_;
std::vector<uint8_t> codes_;
std::vector<uint32_t> points_;
Word(int32_t index, String text, uint32_t count, Word *left = 0, Word *right = 0) : index_(index), text_(text), count_(count), left_(left), right_(right) {}
Word(const Word&) = delete;
const Word& operator = (const Word&) = delete;
};
typedef std::shared_ptr<Word> WordP;
struct Sentence
{
std::vector<Word *> words_;
std::vector<String> tokens_;
};
typedef std::shared_ptr<Sentence> SentenceP;
std::vector<Vector> syn0_, syn1_;
std::vector<Vector> syn0norm_;
std::unordered_map<String, WordP> vocab_;
std::vector<Word *> words_;
int layer1_size_;
int window_;
//subsampling
float sample_;
int min_count_;
float alpha_, min_alpha_;
bool phrase_;
float phrase_threshold_;
Word2Vec(int size = 100, int window = 5, float sample = 0.001, int min_count = 5, int negative = 0, float alpha = 0.025, float min_alpha = 0.0001)
:layer1_size_(size), window_(window), sample_(sample), min_count_(min_count), alpha_(alpha), min_alpha_(min_alpha), phrase_(false), phrase_threshold_(50)
{}
bool has(const String& w) const { return vocab_.find(w) != vocab_.end(); }
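// Count tokens (optionally merging frequent bigrams into phrases), drop rare words, build the Huffman tree
// that assigns each word its codes_/points_, and randomly initialize syn0_.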
int build_vocab(std::vector<SentenceP>& sentences) {
size_t count = 0;
std::unordered_map<String, int> vocab;
for (auto& sentence: sentences) {
++count;
String last_token;
for (auto& token: sentence->tokens_) {
vocab[token] += 1;
// add bigram phrases
if (phrase_) {
if(!last_token.empty()) vocab[last_token + Cvt<String>::from_utf8("_") + token] += 1;
last_token = token;
}
}
}
if (phrase_) {
count = 0;
int total_words = std::accumulate(vocab.begin(), vocab.end(), 0, [](int x, const std::pair<String, int>& v) { return x + v.second; });
std::unordered_map<String, int> phrase_vocab;
for (auto& sentence: sentences) {
++count;
std::vector<String> phrase_tokens;
String last_token;
uint32_t pa = 0, pb = 0, pab = 0;
for (auto& token: sentence->tokens_) {
pb = vocab[token];
if (!last_token.empty()) {
String phrase = last_token + Cvt<String>::from_utf8("_") + token;
pab = vocab[phrase];
float score = 0;
if (pa >= min_count_ && pb >= min_count_ && pab >= min_count_)
score = (pab - min_count_ ) / (float(pa) * pb) * total_words;
if (score > phrase_threshold_) {
phrase_tokens.emplace_back(phrase);
token.clear();
phrase_vocab[phrase] += 1;
}
else {
phrase_tokens.emplace_back(last_token);
phrase_vocab[last_token] += 1;
}
}
last_token = token;
pa = pb;
}
if (!last_token.empty()) {
phrase_tokens.emplace_back(last_token);
phrase_vocab[last_token] += 1;
}
sentence->tokens_.swap(phrase_tokens);
}
vocab.swap(phrase_vocab);
}
int n_words = vocab.size();
if (n_words <= 1) return -1;
words_.reserve(n_words);
auto comp = [](Word *w1, Word *w2) { return w1->count_ > w2->count_; };
for (auto& p: vocab) {
uint32_t count = p.second;
if (count <= min_count_) continue;
auto r = vocab_.emplace(p.first, WordP(new Word{0, p.first, count}));
if (r.second) words_.emplace_back(r.first->second.get());
}
std::sort(words_.begin(), words_.end(), comp);
int index = 0;
for (auto& w: words_) w->index_ = index++;
n_words = words_.size();
std::vector<Word *> heap = words_;
std::make_heap(heap.begin(), heap.end(), comp);
std::vector<WordP> tmp;
for (int i=0; i<n_words-1; ++i) {
std::pop_heap(heap.begin(), heap.end(), comp);
auto min1 = heap.back(); heap.pop_back();
std::pop_heap(heap.begin(), heap.end(), comp);
auto min2 = heap.back(); heap.pop_back();
tmp.emplace_back(WordP(new Word{i + n_words, Cvt<String>::from_utf8(""), min1->count_ + min2->count_, min1, min2}));
heap.emplace_back(tmp.back().get());
std::push_heap(heap.begin(), heap.end(), comp);
}
int max_depth = 0;
std::list<std::tuple<Word *, std::vector<uint32_t>, std::vector<uint8_t>>> stack;
stack.emplace_back(std::make_tuple(heap[0], std::vector<uint32_t>(), std::vector<uint8_t>()));
count = 0;
while (!stack.empty()) {
auto t = stack.back();
stack.pop_back();
Word *word = std::get<0>(t);
if (word->index_ < n_words) {
word->points_ = std::get<1>(t);
word->codes_ = std::get<2>(t);
max_depth = std::max((int)word->codes_.size(), max_depth);
}
else {
auto points = std::get<1>(t);
points.emplace_back(word->index_ - n_words);
auto codes1 = std::get<2>(t);
auto codes2 = codes1;
codes1.emplace_back(0); codes2.emplace_back(1);
stack.emplace_back(std::make_tuple(word->left_, points, codes1));
stack.emplace_back(std::make_tuple(word->right_, points, codes2));
}
}
syn0_.resize(n_words);
syn1_.resize(n_words);
std::default_random_engine eng(::time(NULL));
std::uniform_real_distribution<float> rng(0.0, 1.0);
for (auto& s: syn0_) {
s.resize(layer1_size_);
for (auto& x: s) x = (rng(eng) - 0.5) / layer1_size_;
}
for (auto& s: syn1_) s.resize(layer1_size_);
return 0;
}
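// Skip-gram training over all sentences (parallelized with OpenMP). Frequent words are subsampled according
// to sample_, the learning rate decays linearly from alpha_ towards min_alpha_, and normalized copies of the
// vectors are kept in syn0norm_.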
int train(std::vector<SentenceP>& sentences) {
int total_words = std::accumulate(vocab_.begin(), vocab_.end(), 0,
[](int x, const std::pair<String, WordP>& p) { return (int)(x + p.second->count_); });
int current_words = 0;
float alpha0 = alpha_, min_alpha = min_alpha_;
std::default_random_engine eng(::time(NULL));
std::uniform_real_distribution<float> rng(0.0, 1.0);
size_t n_sentences = sentences.size();
size_t last_words = 0;
auto cstart = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
for (size_t i=0; i < n_sentences; ++i) {
auto sentence = sentences[i].get();
if (sentence->tokens_.empty())
continue;
size_t len = sentence->tokens_.size();
for (size_t t=0; t<len; ++t) {
auto it = vocab_.find(sentence->tokens_[t]);
if (it == vocab_.end()) continue;
Word *word = it->second.get();
// subsampling
if (sample_ > 0) {
float rnd = (sqrt(word->count_ / (sample_ * total_words)) + 1) * (sample_ * total_words) / word->count_;
if (rnd < rng(eng)) continue;
}
sentence->words_.emplace_back(it->second.get());
}
float alpha = std::max(min_alpha, float(alpha0 * (1.0 - 1.0 * current_words / total_words)));
Vector work(layer1_size_);
size_t words = train_sentence(*sentence, alpha, work);
#pragma omp atomic
current_words += words;
if (current_words - last_words > 1024 * 100 || i == n_sentences - 1) {
auto cend = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(cend - cstart).count();
// printf("training alpha: %.4f progress: %.2f%% words per sec: %.3fK\n", alpha, current_words * 100.0/total_words, (current_words - last_words) * 1000.0 / duration);
last_words = current_words;
cstart = cend;
}
}
syn0norm_ = syn0_;
for (auto& row: syn0norm_) v::unit(row);
return 0;
}
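// Save in the textual word2vec format: a "n_words vector_size" header, then one word followed by its vector
// per line, sorted by frequency.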
int save(const std::string& file) const {
std::ofstream out(file, std::ofstream::out);
out << syn0_.size() << " " << syn0_[0].size() << std::endl;
std::vector<Word *> words = words_;
std::sort(words.begin(), words.end(), [](Word *w1, Word *w2) { return w1->count_ > w2->count_; });
for (auto w: words) {
out << Cvt<String>::to_utf8(w->text_);
for (auto i: syn0_[w->index_]) out << " " << i;
out << std::endl;
}
return 0;
}
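// Load a model written by save(); rebuilds vocab_, words_ and syn0_ (word counts are not stored in the file,
// so they are left at 0).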
int load(const std::string& file) {
std::ifstream in(file);
std::string line;
if (! std::getline(in, line)) return -1;
int n_words = 0, layer1_size = 0;
std::istringstream iss(line);
iss >> n_words >> layer1_size;
syn0_.clear(); vocab_.clear(); words_.clear();
syn0_.resize(n_words);
for (int i=0; i<n_words; ++i) {
if (! std::getline(in, line)) return -1;
std::istringstream iss(line);
std::string text;
iss >> text;
auto p = vocab_.emplace(Cvt<String>::from_utf8(text), WordP(new Word{i, Cvt<String>::from_utf8(text), 0}));
words_.emplace_back(p.first->second.get());
syn0_[i].resize(layer1_size);
for(int j=0; j<layer1_size; ++j) {
iss >> syn0_[i][j];
}
}
layer1_size_ = layer1_size;
syn0norm_ = syn0_;
for (auto& row: syn0norm_) v::unit(row);
return 0;
}
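// Top-n words closest to mean(positive) - mean(negative) by dot product over the normalized vectors,
// with the query words themselves filtered out of the result.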
std::vector<std::pair<String,float>> most_similar(std::vector<String> positive, std::vector<String> negative, int topn) {
if ((positive.empty() && negative.empty()) || syn0norm_.empty()) return std::vector<std::pair<String,float>>{};
Vector mean(layer1_size_);
std::vector<int> all_words;
auto add_word = [&mean, &all_words, this](const String& w, float weight) {
auto it = vocab_.find(w);
if (it == vocab_.end()) return;
Word& word = *it->second;
v::saxpy(mean, weight, syn0norm_[word.index_]);
all_words.push_back(word.index_);
};
for (auto& w: positive) add_word(w, 1.0);
for (auto& w: negative) add_word(w, -1.0);
v::unit(mean);
Vector dists;
std::vector<int> indexes;
int i=0;
dists.reserve(syn0norm_.size());
indexes.reserve(syn0norm_.size());
for (auto &x: syn0norm_) {
dists.push_back(v::dot(x, mean));
indexes.push_back(i++);
}
auto comp = [&dists](int i, int j) { return dists[i] > dists[j]; };
int k = std::min(int(topn+all_words.size()), int(indexes.size())-1);
auto first = indexes.begin(), last = indexes.begin() + k, end = indexes.end();
std::make_heap(first, last + 1, comp);
std::pop_heap(first, last + 1, comp);
for (auto it = last + 1; it != end; ++it) {
if (! comp(*it, *first)) continue;
*last = *it;
std::pop_heap(first, last + 1, comp);
}
std::sort_heap(first, last, comp);
std::vector<std::pair<String,float>> results;
for(int i=0, j=0; i<k; ++i) {
if (std::find(all_words.begin(), all_words.end(), indexes[i]) != all_words.end())
continue;
results.push_back(std::make_pair(words_[indexes[i]]->text_, dists[indexes[i]]));
if (++j >= topn) break;
}
return results;
}
const Vector& word_vector(const String& w) const {
static Vector nil;
auto it = vocab_.find(w);
if (it == vocab_.end()) return nil;
return syn0_[it->second->index_];
}
size_t word_vector_size() const { return layer1_size_; }
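// Element-wise average of two vectors; if the sizes differ (e.g. one side is still empty) the non-empty
// operand is returned as-is. main.cpp uses this to fold word vectors into a sentence vector.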
Vector calculate_words_average(const Vector& word_vec1, const Vector& word_vec2) {
size_t size = word_vec1.size();
if (size != word_vec2.size()) {
if (word_vec1.empty()) return word_vec2;
return word_vec1;
}
Vector result(size);
for (size_t i = 0; i < size; ++i) result[i] = (word_vec1[i] + word_vec2[i]) / 2.0f;
return result;
}
private:
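// One skip-gram / hierarchical-softmax pass over a sentence: for each (current, context) pair within the
// randomly reduced window, walk the Huffman path of the current word and update syn1_ and the context word's
// syn0_ row. The sigmoid is read from a precomputed table. Returns the number of words processed.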
int train_sentence(Sentence& sentence, float alpha, Vector& work) {
const int max_size = 1000;
const float max_exp = 6.0;
const static std::vector<float> table = [&](){
std::vector<float> x(max_size);
for (size_t i=0; i<max_size; ++i) { float f = exp( (i / float(max_size) * 2 -1) * max_exp); x[i] = f / (f + 1); }
return x;
}();
int count = 0;
int len = sentence.words_.size();
int reduced_window = rand() % window_;
for (int i=0; i<len; ++i) {
const Word& current = *sentence.words_[i];
size_t codelen = current.codes_.size();
int j = std::max(0, i - window_ + reduced_window);
int k = std::min(len, i + window_ + 1 - reduced_window);
for (; j < k; ++j) {
const Word *word = sentence.words_[j];
if (j == i || word->codes_.empty())
continue;
int word_index = word->index_;
auto& l1 = syn0_[word_index];
std::fill(work.begin(), work.end(), 0);
for (size_t b=0; b<codelen; ++b) {
int idx = current.points_[b];
auto& l2 = syn1_[idx];
float f = v::dot(l1, l2);
if (f <= -max_exp || f >= max_exp)
continue;
int fi = int((f + max_exp) * (max_size / max_exp / 2.0));
f = table[fi];
float g = (1 - current.codes_[b] - f) * alpha;
v::saxpy(work, g, l2);
v::saxpy(l2, g, l1);
}
v::saxpy(l1, 1.0, work);
}
++count;
}
return count;
}
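// Dot product of the two raw (unnormalized) word vectors; returns 0 if either word is out of vocabulary.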
float similarity(const String& w1, const String& w2) const {
auto it1 = vocab_.find(w1), it2 = vocab_.find(w2);
if (it1 != vocab_.end() && it2 != vocab_.end())
return v::dot(syn0_[it1->second->index_], syn0_[it2->second->index_]);
return 0;
}
};
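// Density-based clustering (DBSCAN-style) over sentence vectors using L1 distance.
// Label 0 marks noise/outliers; labels >= 1 are clusters.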
template <class Double = std::double_t>
struct DBSCAN
{
struct Model {
Vector features; // feature vector
int label; // cluster label (0 = noise)
};
Double epsilon;
size_t minPts;
DBSCAN(Double epsilon = 0.6, size_t minPts = 1) :epsilon(epsilon), minPts(minPts) {}
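// Assign a cluster label to every vector: a point with fewer than minPts neighbors within epsilon becomes
// noise (label 0); otherwise a new cluster is started and grown by expanding the neighborhood.
// Note that neighbors are only searched forward (j > i), so results depend on input order.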
std::vector<int> dbscan(const std::vector<Vector>& sentences) {
int n = sentences.size();
std::vector<int> labels(n, -1);
int cluster = 0;
for (int i = 0; i < n; ++i) {
if (labels[i] != -1) continue;
std::vector<int> neighbors;
for (int j = i + 1; j < n; ++j) {
Double distance = calculate_distance(sentences[i], sentences[j]);
if (distance <= epsilon) neighbors.emplace_back(j);
}
if (neighbors.size() < minPts) {
labels[i] = 0; // too few neighbors: mark as noise/outlier
} else {
++cluster;
labels[i] = cluster;
// expand the cluster; neighbors may grow while we iterate
for (size_t j = 0; j < neighbors.size(); ++j) {
int neighborIndex = neighbors[j];
if (labels[neighborIndex] == 0) {
labels[neighborIndex] = cluster;
}
if (labels[neighborIndex] != -1) continue;
labels[neighborIndex] = cluster;
std::vector<int> newNeighbors;
for (int k = neighborIndex + 1; k < n; ++k) {
Double distance = calculate_distance(sentences[neighborIndex], sentences[k]);
if (distance <= epsilon) newNeighbors.emplace_back(k);
}
if (newNeighbors.size() >= minPts) neighbors.insert(neighbors.end(), newNeighbors.begin(), newNeighbors.end());
}
}
}
return labels;
}
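// A query vector is assigned the label of the first clustered (non-zero) model point within distance
// epsilon - 2; if none is that close, 0 is returned and the query is treated as an outlier.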
int predict(const std::vector<Model>& model, const Vector& sentence) {
int n = model.size();
for (int i = 0; i < n; ++i) {
Double distance = calculate_distance(model[i].features, sentence);
if (distance <= epsilon-2) {
if (model[i].label != 0) {
return model[i].label;
}
}
}
return 0;
}
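// Pair each baseline vector with the label produced by dbscan() so it can be used by predict().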
std::vector<Model> create_model(const std::vector<Vector>& baseline, const std::vector<int>& labels) {
std::vector<Model> model;
size_t numSamples = std::min(baseline.size(), labels.size());
for (size_t i = 0; i < numSamples; ++i) {
Model m;
m.features = baseline[i];
m.label = labels[i];
model.emplace_back(m);
}
return model;
}
~DBSCAN() {}
private:
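// Manhattan (L1) distance between two vectors.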
Double calculate_distance(const Vector& v1, const Vector& v2) {
Double distance = 0.0;
size_t size = std::min(v1.size(), v2.size());
for (size_t i = 0; i < size; ++i) {
Double diff = v1[i] - v2[i];
distance += std::abs(diff);
}
return distance;
}
};
} // namespace ai
main.cpp
// OpenMP is required..
// g++-4.8 -o w2v -fopenmp -std=c++0x -Ofast -march=native -funroll-loops main.cpp -lpthread
#include "ai.hpp"
#include <fstream>
#include <iostream>
#include <sstream>
#include <list>
#include <set>
#define MODEL_SIZE 1000
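// Read whitespace-separated tokens, one sentence per line.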
std::vector<ai::Word2Vec<std::string>::SentenceP> get_sentences (const std::string& filename) {
using Sentence = ai::Word2Vec<std::string>::Sentence;
using SentenceP = ai::Word2Vec<std::string>::SentenceP;
SentenceP sentence(new Sentence);
std::ifstream ifs;
ifs.open(filename, std::ios::in);
std::string line;
std::list<SentenceP> sentence_list;
while (std::getline(ifs, line)) {
std::string token;
std::istringstream iss(line);
while (iss >> token) {
sentence->tokens_.push_back(std::move(token));
}
sentence_list.push_back(std::move(sentence));
sentence.reset(new Sentence);
}
if (!sentence->tokens_.empty()) sentence_list.push_back(std::move(sentence));
std::vector<SentenceP> sentences(sentence_list.begin(), sentence_list.end());
return sentences;
}
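// Build the vocabulary, train the embeddings and dump them to the file "word2vec".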
int train_w2v(ai::Word2Vec<std::string> &w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
if (w2v.build_vocab(sentences) != 0) return -1;
if (w2v.train(sentences) != 0) return -1;
if (w2v.save("word2vec") != 0) return -1;
return 0;
}
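// Map every sentence to the running average of the vectors of its in-vocabulary tokens;
// sentences with no known tokens are skipped.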
std::vector<Vector> get_sentences_vector(ai::Word2Vec<std::string> &w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
std::vector<Vector> sentences_vector;
for (auto sentence = sentences.begin(); sentence != sentences.end(); ++sentence) {
Vector sentence_vector;
sentence_vector.reserve((*sentence)->tokens_.size());
for (auto token = (*sentence)->tokens_.begin(); token != (*sentence)->tokens_.end(); ++token)
if (w2v.has(*token)) sentence_vector = w2v.calculate_words_average(sentence_vector, w2v.word_vector(*token));
if (!sentence_vector.empty()) sentences_vector.push_back(std::move(sentence_vector));
}
return sentences_vector;
}
int main(int argc, const char *argv[])
{
std::shared_ptr<ai::Word2Vec<std::string>> w2v = std::make_shared<ai::Word2Vec<std::string>>(32);
w2v->sample_ = 0;
w2v->min_count_= 1;
// w2v->window_ = 10;
std::vector<ai::Word2Vec<std::string>::SentenceP> sentences = get_sentences("sentences.txt");
if (train_w2v((*w2v), sentences) != 0) return -1;
std::vector<Vector> sentences2vector = get_sentences_vector((*w2v), sentences);
std::cout << "sentence vector size:" << sentences2vector.size() << std::endl;
ai::DBSCAN<std::double_t> dbscan;
dbscan.epsilon = 5.03;
dbscan.minPts = 3;
std::vector<Vector> second_data;
using second_Sentence = ai::Word2Vec<std::string>::Sentence;
using second_SentenceP = ai::Word2Vec<std::string>::SentenceP;
second_SentenceP sentence(new second_Sentence);
std::vector<second_SentenceP> second_sentences_;
int num = 0;
size_t size1 = sentences2vector.size();
for (int i = 0; i < 50; i += 2) {
if ((size_t)(i + 1) * MODEL_SIZE > size1) break; // stop when there are not enough vectors left for a full baseline window
std::vector<Vector> baseline(sentences2vector.begin() + i*MODEL_SIZE, sentences2vector.begin() + (i+1)*MODEL_SIZE);
std::vector<int> labels = dbscan.dbscan(baseline);
std::set<int> labels_set(labels.begin(), labels.end());
std::vector<Vector> average_baseline;
std::unordered_map<int, Vector> baseline_map;
for (size_t b = 0; b < labels.size(); b++) {
baseline_map[labels[b]] = w2v->calculate_words_average(baseline_map[labels[b]], baseline[b]);
}
std::vector<int> new_labels;
std::vector<Vector> new_baseline;
for (const auto& pair : baseline_map) {
new_labels.push_back(pair.first);
new_baseline.push_back(pair.second);
}
std::vector<ai::DBSCAN<std::double_t>::Model> model = dbscan.create_model(baseline, labels);
// std::vector<ai::DBSCAN<std::double_t>::Model> model = dbscan.create_model(new_baseline, new_labels);
for (int j = 0; j < MODEL_SIZE; j++) {
if (j+(i+1)*MODEL_SIZE == size1) goto over;
Vector log = sentences2vector.at(j+(i+1)*MODEL_SIZE);
if (!dbscan.predict(model, log)) {
second_data.push_back(log);
second_sentences_.push_back(sentences[j+(i+1)*MODEL_SIZE]);
num++;
std::cout << "第一次检测的离群点: ";
const ai::Word2Vec<std::string>::SentenceP& sentence = sentences[j+(i+1)*MODEL_SIZE];
const ai::Word2Vec<std::string>::Sentence& tokens = *sentence;
for (const std::string& token : tokens.tokens_) std::cout << token << " ";
std::cout << std::endl;
}
}
std::cout << std::endl;
}
over:
std::cout << "num:" << num << std::endl;
std::vector<ai::Word2Vec<std::string>::SentenceP> second_sentences(second_sentences_.begin(), second_sentences_.end());
num = 0;
std::vector<int> labels = dbscan.dbscan(second_data);
for (size_t i = 0; i < labels.size(); i++) {
if (labels[i] == 0) {
num++;
std::cout << "Second-pass outliers: ";
const ai::Word2Vec<std::string>::SentenceP& sentence = second_sentences[i];
const ai::Word2Vec<std::string>::Sentence& tokens = *sentence;
for (const std::string& token : tokens.tokens_) std::cout << token << " ";
std::cout << std::endl;
}
}
std::cout << "num:" << num << std::endl;
}
sentences.txt
abc def ghi sdff ghrio oongs
sdgg sdgf jong wongg lllg
jjogg weg egege geron osdg lsdg
...