|
通用kmeans算法c++_OOP实现与python可视化输出数据_注释版
- #include <iostream>
- #include <fstream>
- #include "zennze/kmeans_oop.hpp"
- using namespace std;
- using std::cin;
- using std::cout;
- using std::initializer_list;
- using std::runtime_error;
- // 通用kmeans算法c++_OOP实现与python可视化输出数据_注释版
- /**
-  * Point with an arbitrary number of double coordinates, using Euclidean distance.
-  *
-  * Concrete VirtualPoint used by the k-means driver. The static helpers
-  * calcDisToCluster() and avgPoints() match the calcFunc / avgPointFunc
-  * signatures that KmeansAlg::run() expects.
-  *
-  * NOTE(review): every method taking a VirtualPoint downcasts with static_cast
-  * and therefore assumes the argument really is an NDimenPoint.
-  */
- class NDimenPoint : public VirtualPoint
- {
- private:
-     int dimension;     // number of coordinates
-     vector<double> xs; // coordinate values; holds exactly `dimension` entries
- public:
-     // Builds a point with d coordinates, all value-initialised to 0.
-     NDimenPoint(const int d) : dimension(d) { xs.resize(d); }
-     // Builds a point from its coordinates.
-     // Throws runtime_error when l does not contain exactly d values, because
-     // the other members index xs with `dimension` and would read out of bounds.
-     NDimenPoint(const int d, vector<double> l) : dimension(d), xs(l)
-     {
-         if (d < 0 || xs.size() != static_cast<size_t>(d))
-             throw runtime_error("dimension mismatch");
-     }
-     NDimenPoint(const NDimenPoint &p) = default;
-     NDimenPoint &operator=(const NDimenPoint &p) = default;
-     ~NDimenPoint() = default;
-     // Exact (bitwise-value) equality of dimension and every coordinate.
-     // Exact comparison is intentional: Cluster uses it to detect that a
-     // centroid stopped moving.
-     bool operator==(const VirtualPoint &p) override
-     {
-         // Bind by reference so the downcast does not copy the coordinate vector.
-         const auto &other = static_cast<const NDimenPoint &>(p);
-         return dimension == other.dimension && xs == other.xs;
-     }
-     // Logical negation of operator==.
-     bool operator!=(const VirtualPoint &p) override
-     {
-         return !(*this == p);
-     }
-     // Adds p to this point coordinate by coordinate.
-     // Throws runtime_error when the dimensions differ.
-     void add(const NDimenPoint &p)
-     {
-         if (p.dimension != dimension)
-             throw runtime_error("dimension mismatch");
-         for (size_t i = 0; i < xs.size(); i++)
-             xs[i] += p.xs[i];
-     }
-     // Returns a new point whose coordinates are this point's divided by n.
-     // Throws runtime_error when n is 0.
-     NDimenPoint operator/(const int n) const
-     {
-         if (n == 0)
-             throw std::runtime_error("divisor zero error!");
-         NDimenPoint res(dimension);
-         for (size_t i = 0; i < xs.size(); i++)
-             res.xs[i] = xs[i] / n;
-         return res;
-     }
-     // Euclidean distance between this point and p.
-     // Throws runtime_error when the dimensions differ.
-     double disTo(const NDimenPoint &p) const
-     {
-         if (p.dimension != dimension)
-             throw runtime_error("dimension mismatch");
-         double squaredSum = 0;
-         for (size_t i = 0; i < xs.size(); i++)
-         {
-             const double diff = xs[i] - p.xs[i];
-             squaredSum += diff * diff;
-         }
-         return sqrt(squaredSum);
-     }
-     // Renders the point as a JSON array, e.g. "[1, 2.5, 3]".
-     string toString() override
-     {
-         stringstream ss;
-         ss << "[";
-         for (size_t i = 0; i < xs.size(); i++)
-         {
-             if (i > 0)
-                 ss << ", ";
-             ss << xs[i];
-         }
-         ss << "]";
-         return ss.str();
-     }
-     // Distance from point p to the centroid of cluster c (calcFunc signature).
-     // NOTE(review): dereferences c.getCentroid() without a null check, so the
-     // cluster must already have a centroid.
-     static double calcDisToCluster(const VirtualPoint &p, const Cluster &c)
-     {
-         const auto &point = static_cast<const NDimenPoint &>(p);
-         const auto &center = static_cast<const NDimenPoint &>(*(c.getCentroid()));
-         return point.disTo(center);
-     }
-     // Arithmetic mean of a set of points (avgPointFunc signature).
-     // Returns nullptr for an empty set; the result has the dimension of points[0].
-     static sharedVPoint avgPoints(const vector<sharedVPoint> &points)
-     {
-         if (points.empty())
-             return nullptr;
-         NDimenPoint sum(static_cast<const NDimenPoint &>(*points[0]).dimension);
-         for (const auto &p : points)
-             sum.add(static_cast<const NDimenPoint &>(*p));
-         // Sum first, then divide by the count to obtain the mean.
-         return make_shared<NDimenPoint>(sum / static_cast<int>(points.size()));
-     }
- };
- /**
-  * Generates `num` random points with `dimension` coordinates each, every
-  * coordinate drawn uniformly from [0, maxVal).
-  *
-  * The engine is seeded from the current time, so two calls within the same
-  * second produce the same data.
-  * NOTE(review): maxVal is expected to be positive; std::uniform_real_distribution
-  * requires its lower bound not to exceed its upper bound.
-  *
-  * Throws std::runtime_error when num or dimension is negative. Without this
-  * guard a negative value compared against an unsigned loop counter would make
-  * the loops run almost forever.
-  */
- vector<NDimenPoint> geneData(int num, const int dimension, double maxVal = 1000)
- {
-     if (num < 0 || dimension < 0)
-         throw std::runtime_error("num and dimension must be non-negative");
-     std::default_random_engine generator(time(nullptr));
-     std::uniform_real_distribution<double> distribution(0, maxVal);
-     vector<NDimenPoint> points;
-     points.reserve(num);
-     for (int i = 0; i < num; i++)
-     {
-         vector<double> coords;
-         coords.reserve(dimension);
-         for (int j = 0; j < dimension; j++)
-             coords.push_back(distribution(generator));
-         points.push_back(NDimenPoint(dimension, coords));
-     }
-     return points;
- }
- /**
-  * Prints the clustering result to stdout as a JSON object of the form
-  * {"dimension":D, "clusters":[ <cluster>, <cluster>, ... ]}
-  * where each cluster is rendered by Cluster::toString().
-  */
- void output(const vector<Cluster> &clusters, const int dimension)
- {
-     // The quotes inside the literals must be escaped to emit valid JSON keys.
-     cout << "{"
-          << "\"dimension\":" << dimension << "," << endl
-          << "\"clusters\":[";
-     for (size_t i = 0; i < clusters.size(); i++)
-     {
-         if (i > 0)
-             cout << ", ";
-         std::cout << clusters[i].toString() << std::endl;
-     }
-     cout << "]}" << endl;
- }
- /**
-  * Writes the clustering result as JSON to ./kmeans_visualization_py.json
-  * (relative to the working directory), in the same layout that output()
-  * prints. The Python visualiser reads this file.
-  *
-  * If the file cannot be opened, a message is written to stderr and nothing
-  * is written.
-  */
- void output_json(const vector<Cluster> &clusters, const int dimension)
- {
-     std::string file_path = "./kmeans_visualization_py.json";
-     std::ofstream write_out_f(file_path);
-     if (!write_out_f.is_open())
-     {
-         cerr << "cannot open " << file_path << " for writing" << endl;
-         return;
-     }
-     // The quotes inside the literals must be escaped to emit valid JSON keys.
-     write_out_f << "{"
-                 << "\"dimension\":" << dimension << "," << endl
-                 << "\"clusters\":[";
-     for (size_t i = 0; i < clusters.size(); i++)
-     {
-         if (i > 0)
-             write_out_f << ", ";
-         write_out_f << clusters[i].toString() << std::endl;
-     }
-     write_out_f << "]}" << endl;
-     // The stream is closed by its destructor when the function returns.
- }
- /**
-  * Interactive driver: reads the point dimension and the cluster count k from
-  * stdin, generates 150 random points, runs k-means for at most 1000 rounds and
-  * emits the result both to the JSON file and to stdout.
-  *
-  * Invalid input is reported on stderr and the function returns without running:
-  *  - dimension must be at least 1;
-  *  - k must be at least 2, because KmeansAlg::run rejects k <= 1;
-  *  - k must not exceed the number of points, otherwise there are not enough
-  *    distinct points to seed the clusters.
-  */
- void kmeans_work()
- {
-     const int maxRound = 1000;
-     const int pointCnt = 150; // number of generated data points
-     int dimension = 1;        // coordinates per point
-     int k = 0;
-     cerr << "dimension, k: ";
-     if (!(cin >> dimension >> k))
-     {
-         cerr << "invalid input: expected two integers (dimension, k)" << endl;
-         return;
-     }
-     if (dimension < 1 || k < 2 || k > pointCnt)
-     {
-         cerr << "invalid input: need dimension >= 1 and 2 <= k <= " << pointCnt << endl;
-         return;
-     }
-     // The algorithm works on shared pointers to the abstract point type.
-     vector<sharedVPoint> points;
-     for (auto &&p : geneData(pointCnt, dimension))
-         points.push_back(make_shared<NDimenPoint>(p));
-     auto clusters = KmeansAlg::run(points, k, NDimenPoint::calcDisToCluster, NDimenPoint::avgPoints, maxRound);
-     output_json(clusters, dimension);
-     output(clusters, dimension);
- }
- /**
-  * Program entry point: prints a banner and runs the interactive k-means demo.
-  * Returns 0 on success and 1 when the demo throws a std::exception, instead
-  * of letting the exception escape and terminate the process.
-  */
- int main()
- {
-     std::cout << "kmeans算法实现!" << endl;
-     try
-     {
-         kmeans_work();
-     }
-     catch (const std::exception &e)
-     {
-         cerr << "kmeans failed: " << e.what() << endl;
-         return 1;
-     }
-     return 0;
- }
复制代码
- #include <algorithm>
- #include <cmath>
- #include <ctime>
- #include <exception>
- #include <iostream>
- #include <memory>
- #include <random>
- #include <sstream>
- #include <string>
- #include <vector>
- using std::cerr;
- using std::endl;
- using std::make_shared;
- using std::pow;
- using std::shared_ptr;
- using std::sqrt;
- using std::string;
- using std::stringstream;
- using std::to_string;
- using std::vector;
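- /**
-  * k-means: points are the data, a cluster is a group of points.
-  * BEGIN
-  *   pick k points as initial centroids and create one cluster per centroid
-  *   loop
-  *     compute the distance between every point and every cluster
-  *     add each point to its nearest cluster
-  *     update the centroid of every cluster
-  *     no centroid changed? then stop
-  *   output the clusters
-  * END
-  *
-  * Data structures
-  *   point    - ==() toString()
-  *   cluster  - computes its centroid
-  *   calcDis(point, cluster)
-  *   kmeans()
-  * To keep the design generic an object-oriented structure is used. It is fairly
-  * involved, in particular the two functions for distance and for the centroid.
-  *   VirtualPoint - abstract point class without data members; declares three
-  *                  pure virtual functions: ==, != and toString
-  *   Cluster      - data members: a set of VirtualPoint and a centroid (a VirtualPoint);
-  *                  member functions: set centroid, update centroid, clear points, ...
-  *   KmeansAlg    - algorithm framework; run() implements the clustering and takes the
-  *                  required callbacks (point-to-cluster distance, average of points),
-  *                  so the algorithm itself never has to be rewritten
-  *   ------------------
-  *   NDimenPoint  - multi-dimensional point, derives from VirtualPoint
-  * The two generic classes are the virtual point and the cluster; to use them,
-  * derive a concrete point type from VirtualPoint.
-  */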
- /**
-  * Abstract point interface for the clustering code.
-  * It carries no data; a concrete point type supplies equality, inequality
-  * and a textual representation.
-  */
- class VirtualPoint
- {
- public:
-     VirtualPoint() = default;
-     virtual ~VirtualPoint() = default;
-     // Textual form of the point.
-     virtual string toString() = 0;
-     // True when p represents the same point as this object.
-     virtual bool operator==(const VirtualPoint &p) = 0;
-     // True when p represents a different point than this object.
-     virtual bool operator!=(const VirtualPoint &p) = 0;
- };
- // 为何用智能指针 因为 簇里 不停的清空点集与add点,所以为了提高效率 直接操作指针
- typedef shared_ptr<VirtualPoint> sharedVPoint;
- // 求平均点的方法也可能是任意的,因此需要作为参数传递给算法(函数指针)
- typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &);
- /**
-  * A cluster: its centroid plus the points currently assigned to it.
-  * The centroid is recomputed through the averaging callback given at
-  * construction.
-  */
- class Cluster
- {
- private:
-     vector<sharedVPoint> points; // members, held by shared pointer because they are cleared and re-added every round
-     sharedVPoint centroid;       // current centroid; null until setCentroid() or a successful updateCentroid()
-     avgPointFunc *avgPoints;     // callback that computes the centroid of a point set
- public:
-     // Stores the averaging callback used by updateCentroid().
-     Cluster(avgPointFunc avg) : avgPoints(avg) {}
-     ~Cluster() {}
-     // Sets the centroid and also records p as a member, so that a freshly
-     // seeded cluster is not rendered as empty by toString(). Returns *this
-     // to allow chaining.
-     Cluster &setCentroid(sharedVPoint p)
-     {
-         centroid = p;
-         points.push_back(p);
-         return *this;
-     }
-     // Recomputes the centroid from the current members.
-     // Returns true when the centroid changed, false when it is unchanged.
-     // When the callback returns nullptr the old centroid is kept and false
-     // is returned.
-     // NOTE(review): a null result presumably means "no members" - confirm
-     // against the callback in use.
-     bool updateCentroid()
-     {
-         sharedVPoint fresh = avgPoints(points);
-         if (fresh == nullptr)
-             return false;
-         // With no previous centroid any result counts as a change.
-         const bool changed = (centroid == nullptr) || ((*fresh) != (*centroid));
-         centroid = fresh;
-         return changed;
-     }
-     // Removes all members; the centroid is kept.
-     void clear() { points.clear(); }
-     // Appends p to the members.
-     void addPoint(sharedVPoint p)
-     {
-         points.push_back(p);
-     }
-     // Renders the cluster as JSON: {"centroid": <point>,"points": [<point>, ...]}.
-     // Returns "{}" when there is no centroid or no member.
-     string toString() const
-     {
-         if (centroid == nullptr || points.empty())
-             return "{}";
-         stringstream ss;
-         // The quotes inside the literals must be escaped to emit valid JSON keys.
-         ss << "{\"centroid\": " << centroid->toString() << ",\"points\": [";
-         for (size_t i = 0; i < points.size(); i++)
-         {
-             if (i > 0)
-                 ss << ", ";
-             ss << points[i]->toString();
-         }
-         ss << "]}";
-         return ss.str();
-     }
-     // Current centroid (may be null).
-     sharedVPoint getCentroid() const { return centroid; }
-     // Read-only view of the members.
-     const vector<sharedVPoint> &getPoints() const { return points; }
- };
- // 计算 VirtualPoint 与 Cluster的质心 之间的距离
- // 距离的计算方法 可能是任意的(不仅仅欧式距离),因此需要作为参数传递给算法(函数指针)
- typedef double calcFunc(const VirtualPoint &, const Cluster &);
- /**
-  * Generic k-means driver. The distance and averaging operations are passed in
-  * as callbacks, so the same loop works for any VirtualPoint implementation.
-  */
- class KmeansAlg
- {
- public:
-     KmeansAlg() {}
-     ~KmeansAlg() {}
-     /**
-      * Returns k distinct random integers from [0, n), in random order.
-      * Throws std::runtime_error when n is not in (0, 100000000) or when k is
-      * not in [0, n]. With k > n there are not enough distinct values, and
-      * padding the result would hand out index 0 several times.
-      */
-     static vector<int> randDiffNumbers(int n, int k)
-     {
-         const int maxn = 100000000;
-         if (n <= 0 || n >= maxn)
-             throw std::runtime_error("n is less than zero or greater than maxn(100,000,000)");
-         if (k < 0 || k > n)
-             throw std::runtime_error("k must be between 0 and n");
-         vector<int> res(n);
-         for (int i = 0; i < n; i++)
-             res[i] = i;
-         // std::random_shuffle was removed in C++17; std::shuffle needs an explicit engine.
-         std::mt19937 engine(std::random_device{}());
-         std::shuffle(res.begin(), res.end(), engine);
-         res.resize(k);
-         return res;
-     }
-     /**
-      * Clusters `data` into k clusters.
-      *
-      * data      points to cluster
-      * k         number of clusters, must be greater than 1
-      * calcDis   distance between a point and a cluster
-      * avgPoints centroid of a set of points
-      * maxRound  upper bound on the number of iterations
-      *
-      * k random distinct points seed the clusters. Each round reassigns every
-      * point to its nearest cluster and recomputes all centroids; the loop stops
-      * early once no centroid changes. One progress line per round is printed
-      * to stdout.
-      *
-      * Throws std::runtime_error when k <= 1, or when randDiffNumbers rejects
-      * data.size() / k.
-      */
-     static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRound = 1000)
-     {
-         if (k <= 1)
-             throw std::runtime_error("k must be greater than 1");
-         vector<Cluster> clusters;
-         clusters.reserve(k);
-         // Seed every cluster with a distinct, randomly chosen data point.
-         for (const int idx : randDiffNumbers(static_cast<int>(data.size()), k))
-             clusters.push_back(Cluster(avgPoints).setCentroid(data[idx]));
-         for (int round = 0; round < maxRound; round++)
-         {
-             // Memberships are recomputed from scratch every round.
-             for (auto &c : clusters)
-                 c.clear();
-             for (size_t i = 0; i < data.size(); i++)
-             {
-                 // Find the cluster nearest to data[i]; cluster 0 is the initial candidate.
-                 double minDis = calcDis(*(data[i]), clusters[0]);
-                 size_t nearest = 0;
-                 for (size_t j = 1; j < clusters.size(); j++)
-                 {
-                     const double tmpDis = calcDis(*(data[i]), clusters[j]);
-                     if (tmpDis < minDis)
-                     {
-                         minDis = tmpDis;
-                         nearest = j;
-                     }
-                 }
-                 clusters[nearest].addPoint(data[i]);
-             }
-             bool changed = false;
-             for (auto &c : clusters)
-             {
-                 // Call updateCentroid() first: with `changed || update()` the
-                 // short-circuit would skip the update of every cluster after
-                 // the first one that changed.
-                 changed = c.updateCentroid() || changed;
-             }
-             std::cout << "第" << round << "轮迭代:" <<"簇质心是否有改变=" << changed << std::endl;
-             // Converged: no centroid moved in this round.
-             if (!changed)
-                 break;
-         }
-         return clusters;
-     }
- };
复制代码
- # -*- coding: utf-8 -*-
- __author__ = u'东方耀 微信:dfy_88888'
- __date__ = '2022/3/16 下午5:21'
- __product__ = 'PyCharm'
- __filename__ = '14_kmeans聚类结果的可视化_for_c++'
- # 运行kmeans算法
- # 将结果(JSON化)输出到文件中
- # 使用Python读取文件内容
- # 使用pyplot可视化
- from mpl_toolkits.mplot3d import Axes3D
- import matplotlib.pyplot as plt
- import json
- import random
- colors = [
- "#ff0000", "#00ff00", "#0000ff", "#404040", "#ff00ff", "#00ffff", "#C0ff00", "#ffC000", "#ff00C0", "#000070",
- "#007000", "#700000",
- ]
- def paint(ax, xs, ys, color, zs=None, marker='.', s=30):
- if zs != None:
- # print("这是打印三维的")
- ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker, s=s)
- else:
- ax.scatter(x=xs, y=ys, c=color, marker=marker, s=s)
- def readData():
- random.shuffle(colors)
- output_json_c_file = "/home/jiang/jjj_eigen_works/my_use_eigen_demos/build/kmeans_visualization_py.json"
- data = json.load(open(output_json_c_file, mode="r", encoding="utf-8"))
- dimension = data["dimension"]
- clusters = []
- clusterCnt = 0
- for tmpRawCluster in data["clusters"]:
- tmpCluster = {"centroid": None, "xss": [],
- "color": colors[clusterCnt % 140]}
- if "centroid" in tmpRawCluster:
- tmpCluster["centroid"] = tmpRawCluster["centroid"]
- for i in range(0, dimension):
- tmpCluster["xss"].append([])
- if "points" in tmpRawCluster:
- for tmpRawPoint in tmpRawCluster["points"]:
- for j in range(0, len(tmpRawPoint)):
- tmpCluster["xss"][j].append(tmpRawPoint[j])
- clusters.append(tmpCluster)
- clusterCnt += 1
- return {"dimension": dimension, "clusters": clusters}
- def work():
- data = readData()
- print("读入的数据:维度=%d, 类别k=%d" % (int(data["dimension"]), len(data["clusters"])))
- fig = plt.figure()
- if data["dimension"] == 2:
- ax = fig.add_subplot(111)
- for cluster in data["clusters"]:
- if cluster["centroid"]:
- paint(ax, cluster["xss"][0],
- cluster["xss"][1], cluster["color"], marker='o')
- # 画质心用大点
- paint(ax, [cluster["centroid"][0]], [
- cluster["centroid"][1]], "#000000", marker='^', s=150)
- elif data["dimension"] == 3:
- ax = fig.add_subplot(111, projection='3d')
- for cluster in data["clusters"]:
- paint(ax, cluster["xss"][0], cluster["xss"]
- [1], cluster["color"], cluster["xss"][2], marker='o')
- # 画质心用大点
- paint(ax, cluster["centroid"][0], cluster["centroid"]
- [1], "#000000", cluster["centroid"][2], marker='^', s=150)
- plt.show()
- pass
- if __name__ == "__main__":
- work()
复制代码
|
|