算法实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import numpy as np
import operator

def createDataSet():
group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
return group, labels

def classify0(inX, dataSet, labels, k):
dataSetSize = dataSet.shape[0]
diffMat = np.tile(inX, (dataSetSize, 1))-dataSet
sqDiffMat = diffMat**2
sqDistances = sqDiffMat.sum(axis=1)
distances = sqDistances**0.5
sortedDistIndicies = distances.argsort()
classCount = {}
for i in range(k):
voteIlabel = labels[sortedDistIndicies[i]]
classCount[voteIlabel] = classCount.get(voteIlabel, 0)+1
sortedClassCount = sorted(classCount,
key=lambda x: classCount[x], reverse=True)
return sortedClassCount[0][0]

正向最大匹配法(Maximum Match Method,MM 法)是指从左向右按最大原则与词典里面的词进行匹配。假设词典中最长词是 m个字,那么从待切分文本的最左边取 m个字符与词典进行匹配,如果匹配成功,则分词, 如果匹配不成功,那么取 m−1 个字符与词典匹配,直到成功匹配为止。