| New file |
| | |
| | | //usr/bin/cc -Ofast -lm "${0}" -o "${0%.c}" && ./"${0%.c}" "$@"; s=$?; rm ./"${0%.c}"; exit $s |
| | | |
| | | #include <math.h> |
| | | #include <stdio.h> |
| | | #include <stdlib.h> |
| | | #include <string.h> |
| | | #include <time.h> |
| | | |
| | | typedef struct matrix{ |
| | | int rows, cols; |
| | | double **vals; |
| | | } matrix; |
| | | |
| | | matrix csv_to_matrix(char *filename, int header); |
| | | matrix make_matrix(int rows, int cols); |
| | | void zero_matrix(matrix m); |
| | | |
| | | void copy(double *x, double *y, int n); |
| | | double dist(double *x, double *y, int n); |
| | | int *sample(int n); |
| | | |
| | | int find_int_arg(int argc, char **argv, char *arg, int def); |
| | | int find_arg(int argc, char* argv[], char *arg); |
| | | |
| | | int closest_center(double *datum, matrix centers) |
| | | { |
| | | int j; |
| | | int best = 0; |
| | | double best_dist = dist(datum, centers.vals[best], centers.cols); |
| | | for(j = 0; j < centers.rows; ++j){ |
| | | double new_dist = dist(datum, centers.vals[j], centers.cols); |
| | | if(new_dist < best_dist){ |
| | | best_dist = new_dist; |
| | | best = j; |
| | | } |
| | | } |
| | | return best; |
| | | } |
| | | |
| | | double dist_to_closest_center(double *datum, matrix centers) |
| | | { |
| | | int ci = closest_center(datum, centers); |
| | | return dist(datum, centers.vals[ci], centers.cols); |
| | | } |
| | | |
| | | int kmeans_expectation(matrix data, int *assignments, matrix centers) |
| | | { |
| | | int i; |
| | | int converged = 1; |
| | | for(i = 0; i < data.rows; ++i){ |
| | | int closest = closest_center(data.vals[i], centers); |
| | | if(closest != assignments[i]) converged = 0; |
| | | assignments[i] = closest; |
| | | } |
| | | return converged; |
| | | } |
| | | |
| | | void kmeans_maximization(matrix data, int *assignments, matrix centers) |
| | | { |
| | | int i,j; |
| | | int *counts = calloc(centers.rows, sizeof(int)); |
| | | zero_matrix(centers); |
| | | for(i = 0; i < data.rows; ++i){ |
| | | ++counts[assignments[i]]; |
| | | for(j = 0; j < data.cols; ++j){ |
| | | centers.vals[assignments[i]][j] += data.vals[i][j]; |
| | | } |
| | | } |
| | | for(i = 0; i < centers.rows; ++i){ |
| | | if(counts[i]){ |
| | | for(j = 0; j < centers.cols; ++j){ |
| | | centers.vals[i][j] /= counts[i]; |
| | | } |
| | | } |
| | | } |
| | | } |
| | | |
| | | double WCSS(matrix data, int *assignments, matrix centers) |
| | | { |
| | | int i, j; |
| | | double sum = 0; |
| | | |
| | | for(i = 0; i < data.rows; ++i){ |
| | | int ci = assignments[i]; |
| | | sum += (1 - dist(data.vals[i], centers.vals[ci], data.cols)); |
| | | } |
| | | return sum / data.rows; |
| | | } |
| | | |
| | | typedef struct{ |
| | | int *assignments; |
| | | matrix centers; |
| | | } model; |
| | | |
| | | void smart_centers(matrix data, matrix centers) { |
| | | int i,j; |
| | | copy(data.vals[rand()%data.rows], centers.vals[0], data.cols); |
| | | double *weights = calloc(data.rows, sizeof(double)); |
| | | int clusters = centers.rows; |
| | | for (i = 1; i < clusters; ++i) { |
| | | double sum = 0; |
| | | centers.rows = i; |
| | | for (j = 0; j < data.rows; ++j) { |
| | | weights[j] = dist_to_closest_center(data.vals[j], centers); |
| | | sum += weights[j]; |
| | | } |
| | | double r = sum*((double)rand()/RAND_MAX); |
| | | for (j = 0; j < data.rows; ++j) { |
| | | r -= weights[j]; |
| | | if(r <= 0){ |
| | | copy(data.vals[j], centers.vals[i], data.cols); |
| | | break; |
| | | } |
| | | } |
| | | } |
| | | free(weights); |
| | | } |
| | | |
| | | void random_centers(matrix data, matrix centers){ |
| | | int i; |
| | | int *s = sample(data.rows); |
| | | for(i = 0; i < centers.rows; ++i){ |
| | | copy(data.vals[s[i]], centers.vals[i], data.cols); |
| | | } |
| | | free(s); |
| | | } |
| | | |
| | | model do_kmeans(matrix data, int k) |
| | | { |
| | | matrix centers = make_matrix(k, data.cols); |
| | | int *assignments = calloc(data.rows, sizeof(int)); |
| | | smart_centers(data, centers); |
| | | //random_centers(data, centers); |
| | | if(k == 1) kmeans_maximization(data, assignments, centers); |
| | | while(!kmeans_expectation(data, assignments, centers)){ |
| | | kmeans_maximization(data, assignments, centers); |
| | | } |
| | | model m; |
| | | m.assignments = assignments; |
| | | m.centers = centers; |
| | | return m; |
| | | } |
| | | |
| | | int main(int argc, char *argv[]) |
| | | { |
| | | if(argc < 3){ |
| | | fprintf(stderr, "usage: %s <csv-file> [points/centers/stats]\n", argv[0]); |
| | | return 0; |
| | | } |
| | | int i,j; |
| | | srand(time(0)); |
| | | matrix data = csv_to_matrix(argv[1], 0); |
| | | int k = find_int_arg(argc, argv, "-k", 2); |
| | | int header = find_arg(argc, argv, "-h"); |
| | | int count = find_arg(argc, argv, "-c"); |
| | | |
| | | if(strcmp(argv[2], "assignments")==0){ |
| | | model m = do_kmeans(data, k); |
| | | int *assignments = m.assignments; |
| | | for(i = 0; i < k; ++i){ |
| | | if(i != 0) printf("-\n"); |
| | | for(j = 0; j < data.rows; ++j){ |
| | | if(!(assignments[j] == i)) continue; |
| | | printf("%f, %f\n", data.vals[j][0], data.vals[j][1]); |
| | | } |
| | | } |
| | | }else if(strcmp(argv[2], "centers")==0){ |
| | | model m = do_kmeans(data, k); |
| | | printf("WCSS: %f\n", WCSS(data, m.assignments, m.centers)); |
| | | int *counts = 0; |
| | | if(count){ |
| | | counts = calloc(k, sizeof(int)); |
| | | for(j = 0; j < data.rows; ++j){ |
| | | ++counts[m.assignments[j]]; |
| | | } |
| | | } |
| | | for(j = 0; j < m.centers.rows; ++j){ |
| | | if(count) printf("%d, ", counts[j]); |
| | | printf("%f, %f\n", m.centers.vals[j][0], m.centers.vals[j][1]); |
| | | } |
| | | }else if(strcmp(argv[2], "scan")==0){ |
| | | for(i = 1; i <= k; ++i){ |
| | | model m = do_kmeans(data, i); |
| | | printf("%f\n", WCSS(data, m.assignments, m.centers)); |
| | | } |
| | | } |
| | | return 0; |
| | | } |
| | | |
| | | // Utility functions |
| | | |
| | | int *sample(int n) |
| | | { |
| | | int i; |
| | | int *s = calloc(n, sizeof(int)); |
| | | for(i = 0; i < n; ++i) s[i] = i; |
| | | for(i = n-1; i >= 0; --i){ |
| | | int swap = s[i]; |
| | | int index = rand()%(i+1); |
| | | s[i] = s[index]; |
| | | s[index] = swap; |
| | | } |
| | | return s; |
| | | } |
| | | |
| | | double dist(double *x, double *y, int n) |
| | | { |
| | | int i; |
| | | double mw = (x[0] < y[0]) ? x[0] : y[0]; |
| | | double mh = (x[1] < y[1]) ? x[1] : y[1]; |
| | | double inter = mw*mh; |
| | | double sum = x[0]*x[1] + y[0]*y[1]; |
| | | double un = sum - inter; |
| | | double iou = inter/un; |
| | | return 1-iou; |
| | | } |
| | | |
| | | void copy(double *x, double *y, int n) |
| | | { |
| | | int i; |
| | | for(i = 0; i < n; ++i) y[i] = x[i]; |
| | | } |
| | | |
| | | void error(char *s){ |
| | | fprintf(stderr, "Error: %s\n", s); |
| | | exit(0); |
| | | } |
| | | |
| | | char *fgetl(FILE *fp) |
| | | { |
| | | if(feof(fp)) return 0; |
| | | int size = 512; |
| | | char *line = malloc(size*sizeof(char)); |
| | | if(!fgets(line, size, fp)){ |
| | | free(line); |
| | | return 0; |
| | | } |
| | | |
| | | int curr = strlen(line); |
| | | |
| | | while(line[curr-1]!='\n'){ |
| | | size *= 2; |
| | | line = realloc(line, size*sizeof(char)); |
| | | if(!line) error("Malloc"); |
| | | fgets(&line[curr], size-curr, fp); |
| | | curr = strlen(line); |
| | | } |
| | | line[curr-1] = '\0'; |
| | | |
| | | return line; |
| | | } |
| | | |
| | | // Matrix stuff |
| | | |
| | | int count_fields(char *line) |
| | | { |
| | | int count = 0; |
| | | int done = 0; |
| | | char *c; |
| | | for(c = line; !done; ++c){ |
| | | done = (*c == '\0'); |
| | | if(*c == ',' || done) ++count; |
| | | } |
| | | return count; |
| | | } |
| | | |
| | | double *parse_fields(char *l, int n) |
| | | { |
| | | int i; |
| | | double *field = calloc(n, sizeof(double)); |
| | | for(i = 0; i < n; ++i){ |
| | | field[i] = atof(l); |
| | | l = strchr(l, ',')+1; |
| | | } |
| | | return field; |
| | | } |
| | | |
| | | matrix make_matrix(int rows, int cols) |
| | | { |
| | | matrix m; |
| | | m.rows = rows; |
| | | m.cols = cols; |
| | | m.vals = calloc(m.rows, sizeof(double *)); |
| | | int i; |
| | | for(i = 0; i < m.rows; ++i) m.vals[i] = calloc(m.cols, sizeof(double)); |
| | | return m; |
| | | } |
| | | |
| | | void zero_matrix(matrix m) |
| | | { |
| | | int i, j; |
| | | for(i = 0; i < m.rows; ++i){ |
| | | for(j = 0; j < m.cols; ++j) m.vals[i][j] = 0; |
| | | } |
| | | } |
| | | |
| | | matrix csv_to_matrix(char *filename, int header) |
| | | { |
| | | FILE *fp = fopen(filename, "r"); |
| | | if(!fp) error(filename); |
| | | |
| | | matrix m; |
| | | m.cols = -1; |
| | | |
| | | char *line; |
| | | |
| | | int n = 0; |
| | | int size = 1024; |
| | | m.vals = calloc(size, sizeof(double*)); |
| | | if(header) fgetl(fp); |
| | | while((line = fgetl(fp))){ |
| | | if(m.cols == -1) m.cols = count_fields(line); |
| | | if(n == size){ |
| | | size *= 2; |
| | | m.vals = realloc(m.vals, size*sizeof(double*)); |
| | | } |
| | | m.vals[n] = parse_fields(line, m.cols); |
| | | free(line); |
| | | ++n; |
| | | } |
| | | m.vals = realloc(m.vals, n*sizeof(double*)); |
| | | m.rows = n; |
| | | return m; |
| | | } |
| | | |
| | | // Arguement parsing |
| | | |
| | | void del_arg(int argc, char **argv, int index) |
| | | { |
| | | int i; |
| | | for(i = index; i < argc-1; ++i) argv[i] = argv[i+1]; |
| | | argv[i] = 0; |
| | | } |
| | | |
| | | int find_arg(int argc, char* argv[], char *arg) |
| | | { |
| | | int i; |
| | | for(i = 0; i < argc; ++i) { |
| | | if(!argv[i]) continue; |
| | | if(0==strcmp(argv[i], arg)) { |
| | | del_arg(argc, argv, i); |
| | | return 1; |
| | | } |
| | | } |
| | | return 0; |
| | | } |
| | | |
| | | int find_int_arg(int argc, char **argv, char *arg, int def) |
| | | { |
| | | int i; |
| | | for(i = 0; i < argc-1; ++i){ |
| | | if(!argv[i]) continue; |
| | | if(0==strcmp(argv[i], arg)){ |
| | | def = atoi(argv[i+1]); |
| | | del_arg(argc, argv, i); |
| | | del_arg(argc, argv, i); |
| | | break; |
| | | } |
| | | } |
| | | return def; |
| | | } |
| | | |
| | | float find_float_arg(int argc, char **argv, char *arg, float def) |
| | | { |
| | | int i; |
| | | for(i = 0; i < argc-1; ++i){ |
| | | if(!argv[i]) continue; |
| | | if(0==strcmp(argv[i], arg)){ |
| | | def = atof(argv[i+1]); |
| | | del_arg(argc, argv, i); |
| | | del_arg(argc, argv, i); |
| | | break; |
| | | } |
| | | } |
| | | return def; |
| | | } |
| | | |
| | | char *find_char_arg(int argc, char **argv, char *arg, char *def) |
| | | { |
| | | int i; |
| | | for(i = 0; i < argc-1; ++i){ |
| | | if(!argv[i]) continue; |
| | | if(0==strcmp(argv[i], arg)){ |
| | | def = argv[i+1]; |
| | | del_arg(argc, argv, i); |
| | | del_arg(argc, argv, i); |
| | | break; |
| | | } |
| | | } |
| | | return def; |
| | | } |
| | | |