1
+ // simple implementation of the K-nearest-neighbors algorithm in C
2
+
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <math.h>
7
+
8
// Upper bound on the number of coordinates a single DataPoint can store.
#define MAX_COORDINATES 10

// One labelled sample of the dataset.
//   coordinates   - fixed-size storage; only the first numDimensions entries
//                   carry data for this point
//   numDimensions - how many entries of coordinates are in use
//   label         - NUL-terminated class name of the point
typedef struct DataPoint DataPoint;

struct DataPoint {
    float coordinates[MAX_COORDINATES];
    int numDimensions;
    char *label;
};
16
+
17
// Pairs a computed distance with the position of the datapoint it belongs to,
// so the distances can be sorted without losing track of the original point.
typedef struct DistanceIndex DistanceIndex;

struct DistanceIndex {
    float distance; // distance from the query point
    int index;      // position of the datapoint in the caller's array
};
21
+
22
+ DataPoint * createDataPoint (float * coordinates , int numCoordinates , const char * label ) {
23
+ if (numCoordinates > MAX_COORDINATES ) {
24
+ fprintf (stderr , "ERROR : num_coord for DataPoint is greater than MAX_COORDINATES (10)\n" );
25
+ return (DataPoint * ) NULL ;
26
+ }
27
+
28
+ DataPoint * dp = (DataPoint * ) malloc (sizeof (DataPoint ));
29
+ for (int i = 0 ; i < numCoordinates ; i ++ ) {
30
+ dp -> coordinates [i ] = coordinates [i ];
31
+ }
32
+ for (int i = numCoordinates ; i < MAX_COORDINATES ; i ++ ) {
33
+ dp -> coordinates [i ] = 0 ;
34
+ }
35
+
36
+ dp -> numDimensions = numCoordinates ;
37
+
38
+ dp -> label = (char * ) malloc (strlen (label ) + 1 );
39
+ strcpy (dp -> label , label );
40
+ return dp ;
41
+ }
42
+
43
+ void freeDataPoint (DataPoint * dp ) {
44
+ free (dp -> label );
45
+ free (dp );
46
+ printf ("deleted Datapoint\n" );
47
+ }
48
+
49
+ // compute euclidean distance between two datapoints of equal dimensions
50
+ float calculate_euclidean_distance (DataPoint p1 , DataPoint p2 ) {
51
+ if (p1 .numDimensions != p2 .numDimensions ) {
52
+ fprintf (stderr , "ERROR : given datapoints are of unequal dimensions (%d and %d)\n" , p1 .numDimensions , p2 .numDimensions );
53
+ return NAN ;
54
+ }
55
+
56
+ float distance = 0 ;
57
+
58
+ for (int i = 0 ; i < p1 .numDimensions ; i ++ ) {
59
+ float diff = (float ) (p1 .coordinates [i ] - p2 .coordinates [i ]);
60
+ distance += (diff * diff );
61
+ }
62
+ return sqrtf (distance );
63
+ }
64
+
65
+ float compute_manhattan_distance (DataPoint p1 , DataPoint p2 ) {
66
+ if (p1 .numDimensions != p2 .numDimensions ) {
67
+ fprintf (stderr , "ERROR : given datapoints are of unequal dimensions (%d and %d)\n" , p1 .numDimensions , p2 .numDimensions );
68
+ return NAN ;
69
+ }
70
+
71
+ float distance = 0 ;
72
+
73
+ for (int i = 0 ; i < p1 .numDimensions ; i ++ ) {
74
+ float diff = (float ) (p1 .coordinates [i ] - p2 .coordinates [i ]);
75
+ distance += abs (diff );
76
+ }
77
+ return distance ;
78
+ }
79
+
80
+ int compare (const void * a , const void * b ) {
81
+ float diff = ((DistanceIndex * )a )-> distance - ((DistanceIndex * )b )-> distance ;
82
+ return (diff > 0 ) - (diff < 0 );
83
+ }
84
+
85
+ // KNN implementation
86
+ const char * k_nearest_neighbors (DataPoint currentPoints [], int currentPointCount , DataPoint testPoint , int k ) {
87
+ // float *distances = (float*) malloc(currentPointCount * sizeof(float));
88
+ DistanceIndex * distances = (DistanceIndex * ) malloc (currentPointCount * sizeof (DistanceIndex ));
89
+
90
+ for (int i = 0 ; i < currentPointCount ; i ++ ) {
91
+ distances [i ].distance = calculate_euclidean_distance (currentPoints [i ], testPoint );
92
+ distances [i ].index = i ;
93
+ }
94
+
95
+ qsort (distances , currentPointCount , sizeof (DistanceIndex ), compare );
96
+
97
+ printf ("\n\n%d nearest neighbors for our testpoint\n" , k );
98
+ for (int i = 0 ; i < k ; i ++ ) {
99
+ printf ("point %d | distance : %f | label : %s\n" , distances [i ].index , distances [i ].distance , currentPoints [distances [i ].index ].label );
100
+ }
101
+
102
+ int labelCounts [MAX_COORDINATES ] = {0 };
103
+ const char * labels [MAX_COORDINATES ];
104
+ int labelSize = 0 ;
105
+
106
+ for (int i = 0 ; i < k ; i ++ ) {
107
+ const char * label = currentPoints [distances [i ].index ].label ;
108
+ int found = 0 ;
109
+
110
+ for (int j = 0 ; j < labelSize ; j ++ ) {
111
+ if (strcmp (labels [j ], label ) == 0 ) {
112
+ labelCounts [j ]++ ;
113
+ found = 1 ;
114
+ break ;
115
+ }
116
+ }
117
+ if (!found ) {
118
+ labels [labelSize ] = label ;
119
+ labelCounts [labelSize ] = 1 ;
120
+ labelSize ++ ;
121
+ }
122
+ }
123
+ int maxCount = 0 ;
124
+ const char * mostFreqLabel = NULL ;
125
+ for (int i = 0 ; i < labelSize ; i ++ ) {
126
+ if (labelCounts [i ] > maxCount ) {
127
+ maxCount = labelCounts [i ];
128
+ mostFreqLabel = labels [i ];
129
+ }
130
+ }
131
+
132
+ free (distances );
133
+ return mostFreqLabel ;
134
+ }
135
+
136
+ int main () {
137
+ int num_dimensions ;
138
+ printf ("enter the number of dimensions for all points : " );
139
+ scanf ("%d" , & num_dimensions );
140
+
141
+ int num_datapoints ;
142
+ printf ("enter the number of points to add in the dataset : " );
143
+ scanf ("%d" , & num_datapoints );
144
+
145
+ DataPoint train_data [num_datapoints ];
146
+
147
+ // take data for all datapoints
148
+ for (int i = 0 ; i < num_datapoints ; i ++ ) {
149
+ float coordinates [num_dimensions ];
150
+ printf ("enter the dimensions for point %d : " , i + 1 );
151
+ for (int j = 0 ; j < num_dimensions ; j ++ ) {
152
+ scanf ("%f" , & coordinates [j ]);
153
+ }
154
+
155
+ char label [100 ];
156
+ printf ("enter the label for point %d : " , i + 1 );
157
+ scanf ("%s" , label );
158
+
159
+ train_data [i ] = * createDataPoint (coordinates , num_dimensions , label );
160
+ }
161
+
162
+ // take data for the testpoint
163
+ DataPoint testPoint ;
164
+ printf ("enter coordinates for test point : " );
165
+ float testCoordinates [num_dimensions ];
166
+ for (int i = 0 ; i < num_dimensions ; i ++ ) {
167
+ scanf ("%f" , & testCoordinates [i ]);
168
+ }
169
+ testPoint = * createDataPoint (testCoordinates , num_dimensions , "test" );
170
+ // free(testCoordinates);
171
+
172
+ int k = 5 ;
173
+ const char * predicted_label = k_nearest_neighbors (train_data , num_datapoints , testPoint , k );
174
+ printf ("\n\npredicted label : %s\n" , predicted_label );
175
+
176
+ // freeDataPoint(&testPoint);
177
+ // for (int i = 0; i < num_datapoints; i++) {
178
+ // freeDataPoint(&train_data[i]);
179
+ // }
180
+ // free(train_data);
181
+ printf ("freed all memory - exiting\n" );
182
+
183
+ return 0 ;
184
+ }
0 commit comments