-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAssignment2.py
50 lines (38 loc) · 1.6 KB
/
Assignment2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from pyspark import SparkContext
import math
import time
start_time = time.time()
sc = SparkContext()

# Sentinel distance that pushes the test instance itself to the very end of
# the neighbour ranking.  float('inf') is guaranteed to exceed any real
# Euclidean distance (the original hard-coded 500 was not).
INFINITE_DIST = float('inf')
K_NEIGHBOURS = 50   # number of nearest neighbours per test instance
NUM_CLASSES = 5     # class labels are the integers 0..4

Dataset = sc.textFile("ass2-eda-18.txt")

# 'with' guarantees the result file is flushed and closed even on error
# (the original leaked the file handle).
with open('ass2-eda-result.txt', 'w') as result_file_ptr:
    result_file_ptr.write("c0\tc1\tc2\tc3\tc4\n")
    test_instances = [t.split() for t in Dataset.toLocalIterator()]
    for t in test_instances:
        # t=t binds the current test instance eagerly; relying on late-bound
        # closure capture only works because takeOrdered happens to force
        # evaluation inside this iteration.
        def class_Edist_map(line, t=t):
            """Return (class_label, Euclidean distance from `line` to test instance t)."""
            line = line.split()
            class_label = line[-1]
            feature = line[:-1]
            if line == t:  # exclude the test instance itself from its own neighbours
                return (class_label, INFINITE_DIST)
            dist = 0.0
            for i, j in zip(feature, t[:-1]):
                dist += (float(i) - float(j)) ** 2
            return (class_label, math.sqrt(dist))

        dataset_map = Dataset.map(class_Edist_map)
        sorted_neighbour = dataset_map.takeOrdered(K_NEIGHBOURS, key=lambda x: x[1])

        # Count how often each class label appears among the K nearest neighbours.
        result = sc.parallelize([(x[0], 1) for x in sorted_neighbour]) \
                   .reduceByKey(lambda x, y: x + y)
        counts = [(k, v) for k, v in result.toLocalIterator()]

        # Pad absent classes with a zero count so every output row has exactly
        # NUM_CLASSES columns.  str(i), not unicode(i): 'unicode' does not
        # exist in Python 3 and crashed whenever a class was missing.
        present = {int(k) for k, _ in counts}
        for i in range(NUM_CLASSES):
            if i not in present:
                counts.append((str(i), 0))

        # Emit the class proportions in label order c0..c4, tab-separated.
        counts.sort(key=lambda x: int(x[0]))
        for _, v in counts:
            result_file_ptr.write("%s\t" % (float(v) / K_NEIGHBOURS))
        result_file_ptr.write("\n")

print("------%s seconds --------------------------------------------" % (time.time() - start_time))