Skip to content

Commit ff3a7f0

Browse files
committed
first commit
0 parents  commit ff3a7f0

File tree

4 files changed

+282
-0
lines changed

4 files changed

+282
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.exe

README.md

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
A basic implementation of the k-nearest neighbors (kNN) algorithm in C. Why? I don't know.
2+
3+
### usage
4+
You can compile the code on your machine using the following command:
5+
6+
```cmd
7+
gcc knn.c -o knn -lm
8+
```
9+
10+
You can enter the data manually, but I prefer to feed it in from the command line (by redirecting a file), using the following format:
11+
12+
```plaintext
13+
<number_of_dimensions>
14+
<number_of_points_in_dataset>
15+
<dataset>
16+
17+
<coordinates for test point>
18+
```
19+
20+
This format can also be seen in the [input.txt](input.txt) file.
21+
22+
This input file can be run as follows on a Windows machine:
23+
24+
```cmd
25+
knn.exe < input.txt
26+
```
27+
28+
The output would then look as follows:
29+
30+
```plaintext
31+
5 nearest neighbors for our testpoint
32+
point 34 | distance : 0.200000 | label : Setosa
33+
point 32 | distance : 0.244949 | label : Setosa
34+
point 4 | distance : 0.264575 | label : Setosa
35+
point 9 | distance : 0.387298 | label : Setosa
36+
point 35 | distance : 0.435890 | label : Setosa
37+
38+
39+
predicted label : Setosa
40+
freed all memory - exiting
41+
```
42+
43+
---

input.txt

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
4
2+
50
3+
5.1 3.5 1.4 0.2 Setosa
4+
4.9 3.0 1.4 0.2 Setosa
5+
4.7 3.2 1.3 0.2 Setosa
6+
5.0 3.6 1.4 0.2 Setosa
7+
5.4 3.9 1.7 0.4 Setosa
8+
4.6 3.4 1.4 0.3 Setosa
9+
5.0 3.4 1.5 0.2 Setosa
10+
4.4 2.9 1.4 0.2 Setosa
11+
4.9 3.1 1.5 0.1 Setosa
12+
5.4 3.7 1.5 0.2 Setosa
13+
7.0 3.2 4.7 1.4 Versicolor
14+
6.4 3.2 4.5 1.5 Versicolor
15+
6.9 3.1 4.9 1.5 Versicolor
16+
5.5 2.3 4.0 1.3 Versicolor
17+
6.5 2.8 4.6 1.5 Versicolor
18+
5.7 2.8 4.5 1.3 Versicolor
19+
6.3 3.3 4.7 1.6 Versicolor
20+
4.9 2.4 3.3 1.0 Versicolor
21+
6.6 2.9 4.6 1.3 Versicolor
22+
5.2 2.7 3.9 1.4 Versicolor
23+
6.3 3.3 6.0 2.5 Virginica
24+
5.8 2.7 5.1 1.9 Virginica
25+
7.1 3.0 5.9 2.1 Virginica
26+
6.3 2.9 5.6 1.8 Virginica
27+
6.5 3.0 5.8 2.2 Virginica
28+
7.6 3.0 6.6 2.1 Virginica
29+
4.9 2.5 4.5 1.7 Virginica
30+
7.3 2.9 6.3 1.8 Virginica
31+
6.7 2.5 5.8 1.8 Virginica
32+
7.2 3.6 6.1 2.5 Virginica
33+
5.1 3.4 1.5 0.2 Setosa
34+
5.7 3.8 1.7 0.3 Setosa
35+
5.1 3.8 1.5 0.3 Setosa
36+
5.4 3.4 1.7 0.2 Setosa
37+
5.1 3.7 1.5 0.4 Setosa
38+
5.2 3.5 1.5 0.2 Setosa
39+
5.1 3.3 1.7 0.5 Setosa
40+
4.8 3.4 1.9 0.2 Setosa
41+
5.0 3.0 1.6 0.2 Setosa
42+
5.0 3.4 1.6 0.4 Setosa
43+
6.0 2.2 4.0 1.0 Versicolor
44+
6.1 2.8 4.7 1.2 Versicolor
45+
6.2 2.2 4.5 1.5 Versicolor
46+
5.5 2.3 4.0 1.3 Versicolor
47+
5.5 2.6 4.4 1.2 Versicolor
48+
6.1 3.0 4.6 1.4 Versicolor
49+
5.8 2.7 4.1 1.0 Versicolor
50+
5.0 2.0 3.5 1.0 Versicolor
51+
5.6 2.5 3.9 1.1 Versicolor
52+
5.7 2.8 4.5 1.3 Versicolor
53+
54+
5.2 3.8 1.6 0.5

knn.c

+184
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
// simple implementation of the K-nearest-neighbors algorithm in C
2+
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
#include <math.h>
7+
8+
/* Upper bound on the number of coordinates a single DataPoint can store. */
#define MAX_COORDINATES 10

/*
 * One labelled sample.
 *
 *   coordinates   - fixed-capacity coordinate storage (MAX_COORDINATES slots)
 *   numDimensions - how many of those slots are in use
 *   label         - pointer to the sample's label string
 */
typedef struct DataPoint {
    float coordinates[MAX_COORDINATES];
    int   numDimensions;
    char *label;
} DataPoint;
16+
17+
/*
 * Pairs a distance value with an integer index, so that an array of these
 * can be sorted by distance while remembering which element each distance
 * belongs to.
 */
typedef struct DistanceIndex {
    float distance;
    int   index;
} DistanceIndex;
21+
22+
DataPoint* createDataPoint(float *coordinates, int numCoordinates, const char* label) {
23+
if (numCoordinates > MAX_COORDINATES) {
24+
fprintf(stderr, "ERROR : num_coord for DataPoint is greater than MAX_COORDINATES (10)\n");
25+
return (DataPoint*) NULL;
26+
}
27+
28+
DataPoint* dp = (DataPoint*) malloc(sizeof(DataPoint));
29+
for (int i = 0; i < numCoordinates; i++) {
30+
dp->coordinates[i] = coordinates[i];
31+
}
32+
for (int i = numCoordinates; i < MAX_COORDINATES; i++) {
33+
dp->coordinates[i] = 0;
34+
}
35+
36+
dp->numDimensions = numCoordinates;
37+
38+
dp->label = (char*) malloc(strlen(label) + 1);
39+
strcpy(dp->label, label);
40+
return dp;
41+
}
42+
43+
/*
 * Release a DataPoint and its label string.
 *
 * dp - pointer previously obtained from malloc (it is passed to free()),
 *      or NULL, in which case nothing happens.
 *
 * Prints "deleted Datapoint" to stdout after a successful release.
 */
void freeDataPoint(DataPoint* dp) {
    if (dp == NULL) {
        return;
    }
    free(dp->label);
    free(dp);
    printf("deleted Datapoint\n");
}
48+
49+
// compute euclidean distance between two datapoints of equal dimensions
50+
float calculate_euclidean_distance(DataPoint p1, DataPoint p2) {
51+
if (p1.numDimensions != p2.numDimensions) {
52+
fprintf(stderr, "ERROR : given datapoints are of unequal dimensions (%d and %d)\n", p1.numDimensions, p2.numDimensions);
53+
return NAN;
54+
}
55+
56+
float distance = 0;
57+
58+
for (int i = 0; i < p1.numDimensions; i++) {
59+
float diff = (float) (p1.coordinates[i] - p2.coordinates[i]);
60+
distance += (diff * diff);
61+
}
62+
return sqrtf(distance);
63+
}
64+
65+
/*
 * Manhattan (L1) distance between two datapoints: the sum of the absolute
 * per-coordinate differences.
 *
 * Both points must report the same numDimensions; otherwise an error is
 * written to stderr and NAN is returned.
 */
float compute_manhattan_distance(DataPoint p1, DataPoint p2) {
    if (p1.numDimensions != p2.numDimensions) {
        fprintf(stderr, "ERROR : given datapoints are of unequal dimensions (%d and %d)\n", p1.numDimensions, p2.numDimensions);
        return NAN;
    }

    float distance = 0;

    for (int i = 0; i < p1.numDimensions; i++) {
        float diff = p1.coordinates[i] - p2.coordinates[i];
        /* fabsf, not abs: abs() takes an int and would truncate the
         * fractional part of every difference (e.g. 0.7 -> 0). */
        distance += fabsf(diff);
    }
    return distance;
}
79+
80+
int compare(const void* a, const void* b) {
81+
float diff = ((DistanceIndex*)a)->distance - ((DistanceIndex*)b)->distance;
82+
return (diff > 0) - (diff < 0);
83+
}
84+
85+
// KNN implementation
86+
const char* k_nearest_neighbors(DataPoint currentPoints[], int currentPointCount, DataPoint testPoint, int k) {
87+
// float *distances = (float*) malloc(currentPointCount * sizeof(float));
88+
DistanceIndex* distances = (DistanceIndex*) malloc(currentPointCount * sizeof(DistanceIndex));
89+
90+
for (int i = 0; i < currentPointCount; i++) {
91+
distances[i].distance = calculate_euclidean_distance(currentPoints[i], testPoint);
92+
distances[i].index = i;
93+
}
94+
95+
qsort(distances, currentPointCount, sizeof(DistanceIndex), compare);
96+
97+
printf("\n\n%d nearest neighbors for our testpoint\n", k);
98+
for (int i = 0; i < k; i++) {
99+
printf("point %d | distance : %f | label : %s\n", distances[i].index, distances[i].distance, currentPoints[distances[i].index].label);
100+
}
101+
102+
int labelCounts[MAX_COORDINATES] = {0};
103+
const char* labels[MAX_COORDINATES];
104+
int labelSize = 0;
105+
106+
for (int i = 0; i < k; i++) {
107+
const char* label = currentPoints[distances[i].index].label;
108+
int found = 0;
109+
110+
for (int j = 0; j < labelSize; j++) {
111+
if (strcmp(labels[j], label) == 0) {
112+
labelCounts[j]++;
113+
found = 1;
114+
break;
115+
}
116+
}
117+
if (!found) {
118+
labels[labelSize] = label;
119+
labelCounts[labelSize] = 1;
120+
labelSize++;
121+
}
122+
}
123+
int maxCount = 0;
124+
const char* mostFreqLabel = NULL;
125+
for (int i = 0; i < labelSize; i++) {
126+
if (labelCounts[i] > maxCount) {
127+
maxCount = labelCounts[i];
128+
mostFreqLabel = labels[i];
129+
}
130+
}
131+
132+
free(distances);
133+
return mostFreqLabel;
134+
}
135+
136+
int main() {
137+
int num_dimensions;
138+
printf("enter the number of dimensions for all points : ");
139+
scanf("%d", &num_dimensions);
140+
141+
int num_datapoints;
142+
printf("enter the number of points to add in the dataset : ");
143+
scanf("%d", &num_datapoints);
144+
145+
DataPoint train_data[num_datapoints];
146+
147+
// take data for all datapoints
148+
for (int i = 0; i < num_datapoints; i++) {
149+
float coordinates[num_dimensions];
150+
printf("enter the dimensions for point %d : ", i + 1);
151+
for (int j = 0; j < num_dimensions; j++) {
152+
scanf("%f", &coordinates[j]);
153+
}
154+
155+
char label[100];
156+
printf("enter the label for point %d : ", i + 1);
157+
scanf("%s", label);
158+
159+
train_data[i] = *createDataPoint(coordinates, num_dimensions, label);
160+
}
161+
162+
// take data for the testpoint
163+
DataPoint testPoint;
164+
printf("enter coordinates for test point : ");
165+
float testCoordinates[num_dimensions];
166+
for (int i = 0; i < num_dimensions; i++) {
167+
scanf("%f", &testCoordinates[i]);
168+
}
169+
testPoint = *createDataPoint(testCoordinates, num_dimensions, "test");
170+
// free(testCoordinates);
171+
172+
int k = 5;
173+
const char* predicted_label = k_nearest_neighbors(train_data, num_datapoints, testPoint, k);
174+
printf("\n\npredicted label : %s\n", predicted_label);
175+
176+
// freeDataPoint(&testPoint);
177+
// for (int i = 0; i < num_datapoints; i++) {
178+
// freeDataPoint(&train_data[i]);
179+
// }
180+
// free(train_data);
181+
printf("freed all memory - exiting\n");
182+
183+
return 0;
184+
}

0 commit comments

Comments
 (0)