|
楼主 |
发表于 2015-4-8 00:06:19
|
显示全部楼层
本帖最后由 lightninng 于 2015-4-8 23:47 编辑
花了一天时间把自己的dbscan算法跑通,参数选好了,不是很复杂的东西,但是数据量真的太大了,之前是300w条,只拿出来了1w条跑,果然google的map-reduce是一个伟大的发明!!!真要是有时间想研究一下
- ESP = 5 # esp的默认值
- def dbscan(esp=0, minpts=4):
- import pickle
- import os
- import math
- import matplotlib.pyplot as plt
- if not os.path.exists(r"F:\data_storage\distance.txt"): # distance.txt存放距离矩阵,字典形式{点名: {点名: 两点之间的距离}}
- print("距离矩阵不存在,重新建立距离矩阵")
- def o_dis(a, b):
- # 计算欧式距离
- return math.sqrt((a[0]-b[0])**2 + (a[1]-b[1])**2)
- with open(r"org_dict.txt", "rb") as fp: # org_dict .txt存放数据点坐标,字典形式{点名: (横坐标,纵坐标)}
- org_dict = pickle.load(fp)
- n = len(org_dict)
- print(n)
- distance = {}
- with open("F:\data_storage\distance.txt", "wb") as fp_w:
- i = 0
- for each_key in org_dict.keys():
- temp_dict = {}
- for each_item in org_dict.items():
- temp_dict[each_item[0]] = o_dis(org_dict[each_key], each_item[1])
- distance[each_key] = temp_dict
- i += 1
- if i % 1000 == 0:
- print("距离矩阵已更新{}行".format(i))
- pickle.dump(distance, fp_w)
- # 如输入参数esp未给定则重新计算esp
- if esp == 0:
- print("开始计算esp")
- esp_set = []
- with open("distance.txt", "rb") as fp:
- distance = pickle.load(fp)
- i = 0
- n = len(distance)
- for each_key in distance.keys():
- i += 1
- esp_set.append(sorted(list(distance[each_key].values()))[minpts])
- print("已经计算完{}个esp值".format(i))
- # 绘制dsp-k4图
- esp_set.sort(reverse=True)
- x = range(n)
- y = esp_set
- esp = esp_set[int(len(esp_set)*esp_rate)]
- plt.plot(x, y, "o")
- plt.xlabel("x")
- plt.ylabel("esp-k4")
- plt.show()
- print("esp已设定为{}".format(esp))
- print("minpts已设定为{}".format(minpts))
- with open("F:\data_storage\distance.txt", "rb") as fp:
- distance = pickle.load(fp)
- print("org_dict读取完毕")
- with open(r"F:\data_storage\org_dict.txt", "rb") as fp:
- org_dict = pickle.load(fp)
- print("distance读取完毕")
- """
- 标记所有对象为 unvisited
- while True:
- 随机选择一个 unvisited 对象 p
- 标记 p 为 visited
- if p 的ε -邻域至少有MinPts个对象
- 创建一个新簇C,并把 p 添加到C
- 令 N 为 p 的ε -邻域中的对象集合
- for N 中每个点p'
- if p' 是 unvisited
- 标记 p' 为 visited
- if p' 的 ε -邻域至少有 MinPts 个点,把这些点添加到 N
- if p'还不是任何簇的成员,把p'添加到 C
- 输出C;
- else 标记 p 为噪声;
- if 没有标记为 unvisited 的对象:
- break
- """
- # 标记所有对象为 unvisited
- unvisited = set()
- for each_key in org_dict.keys():
- unvisited.add(each_key)
- print("所有对象已标记未遍历")
- result = {}
- i = 0
- k = 0
- while unvisited:
- p_key = unvisited.pop()
- p_set = set([key for key, value in distance[p_key].items() if value <= esp])
- if len(p_set) >= minpts+1:
- print("初始p_set为{}".format(p_set))
- i += 1
- j = 0
- result[p_key] = i
- print("第{0}类聚类开始,已经检测出{1}个噪点".format(i, k))
- while p_set:
- p1_key = p_set.pop()
- if p1_key in unvisited:
- unvisited.remove(p1_key)
- p1_value = i
- result[p1_key] = p1_value
- j += 1
- print("聚类{0}已经有{1}个元素".format(i, j))
- p1_set = set([key for key, value in distance[p1_key].items() if value <= esp])
- if len(p1_set) >= minpts+1:
- for each in p1_set:
- if each in unvisited:
- p1_set.add(each)
- print("聚类{}已创建完毕".format(i))
- else:
- result[p_key] = 0
- k += 1
- if k % 100 == 0:
- print("已经检测出{}个噪点".format(k))
- output = {}
- for pos, cluster in [(tuple(org_dict[key]), result[key]) for key in result.keys()]:
- if cluster not in output:
- output[cluster] = [pos]
- else:
- output[cluster].append(pos)
- color = {0: (0, 0, 0, 1),
- 1: (250/255, 240/255, 230/255, 1),
- 2: (1, 0, 0, 1),
- 3: (1, 1, 0, 1),
- 4: (156/255, 102/255, 31/255, 1),
- 5: (1, 125/255, 64/255, 1),
- 6: (1, 0, 1, 1),
- 7: (100/255, 0, 0, 1),
- 8: (0, 1, 0, 1),
- 9: (0, 0, 1, 1),
- 10: (192/255, 192/255, 192/255, 1),
- }
- for cluster in output:
- if cluster < 10:
- for each in output[cluster]:
- plt.plot(each[0], each[1], 'o', color=color[cluster])
- else:
- print("聚类{}未绘制".format(cluster))
- print("聚类完成,得到聚类{0}个,检测出{1}个噪点".format(i, k))
- plt.show()
复制代码
|
|