huff.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include "huffman.h"
  4. // count the weights of characters in the file filename
  5. void count_weights(char *filename, int *weights){
  6. // store last character read
  7. char ch = 0;
  8. // open file for reading
  9. FILE * fptr = NULL;
  10. fptr = fopen(filename, "r");
  11. if (fptr == NULL){ // check for error
  12. printf("File name error.\n");
  13. return;
  14. }
  15. // count frequency of each ascii character
  16. while((ch = fgetc(fptr)) != EOF){
  17. weights[(int)ch]++;
  18. }
  19. }
  20. int build_list(Huff_Node **head, int *weights){
  21. Huff_Node *node = NULL;
  22. int weight_total = 0;
  23. int i = 0;
  24. for(i = 0; i < ASCII_SIZE; i++){
  25. weight_total = weight_total + weights[i];
  26. if(weights[i] > 0){
  27. node = create_huff_node(i, weights[i]);
  28. add_list_node(head, node);
  29. }
  30. }
  31. return weight_total;
  32. }
  33. // creates a new node
  34. Huff_Node * create_huff_node(char ascii, int weight){
  35. Huff_Node *node = malloc(sizeof(Huff_Node));
  36. node->ascii = ascii;
  37. node->weight = weight;
  38. node->next = NULL;
  39. node->left = NULL;
  40. node->right = NULL;
  41. return node;
  42. }
  43. // adds node to list in ascending order by weight
  44. void add_list_node(Huff_Node **headp, Huff_Node *node){
  45. Huff_Node *cur = NULL;
  46. if(*headp == NULL || node->weight >= (*headp)->weight){
  47. node->next = *headp;
  48. *headp = node;
  49. }
  50. else{
  51. cur = *headp;
  52. while(cur->next != NULL && cur->next->weight > node->weight){
  53. cur = cur->next;
  54. }
  55. node->next = cur->next;
  56. cur->next = node;
  57. }
  58. }
  59. // gets the first element from the list
  60. Huff_Node * remove_min_node(Huff_Node *head){
  61. Huff_Node *node = head;
  62. Huff_Node *retval = head;
  63. while(node->next != NULL){
  64. retval = node;
  65. node = node->next;
  66. }
  67. retval->next = NULL;
  68. retval = node;
  69. return retval;
  70. }
  71. // prints the list values in order
  72. void print_list(Huff_Node *head){
  73. Huff_Node *cur = head;
  74. while(cur != NULL){
  75. printf("ascii: %c\tweight: %d\n", cur->ascii, cur->weight);
  76. cur = cur->next;
  77. }
  78. }
  79. // generate binary tree with weighted parent nodes
  80. Huff_Node * gen_huff_tree(Huff_Node *head, int weight_total){
  81. Huff_Node *parent = NULL;
  82. while(head->weight < weight_total){
  83. parent = create_huff_node(NO_CHAR, 0);
  84. parent->left = remove_min_node(head);
  85. parent->right = remove_min_node(head);
  86. parent->weight = parent->left->weight + parent->right->weight;
  87. add_list_node(&head, parent);
  88. }
  89. return head;
  90. }
  91. // prints the tree values in-order
  92. void print_tree(Huff_Node * root){
  93. if(root == NULL){
  94. return;
  95. }
  96. print_tree(root->left);
  97. if(root->ascii >= 0){
  98. printf("ascii: %c\tweight: %d\n", root->ascii, root->weight);
  99. }
  100. print_tree(root->right);
  101. }
  102. // calculates the height of the left and right subtrees and returns the larger value (to get the total height)
  103. int get_tree_height(Huff_Node *root){
  104. return get_tree_height_rh(root, 0);
  105. }
  106. // recursive helper function for get_tree_height
  107. int get_tree_height_rh(Huff_Node *root, int height){
  108. // stop recursio-
  109. if(root == NULL){
  110. return height;
  111. }
  112. // recurse
  113. int left_height = get_tree_height_rh(root->left, height + 1);
  114. int right_height = get_tree_height_rh(root->right, height + 1);
  115. // return the larger value
  116. if(left_height < right_height){
  117. return right_height;
  118. }
  119. return left_height;
  120. }
  121. // counts the number of leaf nodes in the tree root
  122. int get_num_leaves(Huff_Node *root){
  123. int leaves = 0;
  124. get_num_leaves_rh(root, &leaves);
  125. return leaves;
  126. }
  127. // recursive helper function for get_num_leaves
  128. void get_num_leaves_rh(Huff_Node *root, int *leaves){
  129. if(root == NULL){
  130. return;
  131. }
  132. if(root->ascii >= 0){
  133. (*leaves)++;
  134. return;
  135. }
  136. get_num_leaves_rh(root->left, leaves);
  137. get_num_leaves_rh(root->right, leaves);
  138. }
  139. // generate the codebook and return number of characters
  140. void gen_codes(Huff_Node * root, int ** codebook){
  141. int row = 0;
  142. gen_codes_rh(root, codebook, &row, 1);
  143. }
  144. // recursive helper function to generate the codebook
  145. void gen_codes_rh(Huff_Node *root, int ** codebook, int * row, int col){
  146. int numRow = 0;
  147. int i = 0;
  148. // recurse through tree
  149. if(root == NULL){
  150. return;
  151. }
  152. Huff_Node * left_child = root->left;
  153. Huff_Node * right_child = root->right;
  154. if(root->ascii >= 0){ // this node is a leaf node
  155. codebook[*row][0] = root->ascii;
  156. (*row)++;
  157. return;
  158. }
  159. if(left_child != NULL){
  160. numRow = get_num_leaves(left_child);
  161. for(i = *row; i < (*row) + numRow; i++){
  162. codebook[i][col] = 0;
  163. }
  164. gen_codes_rh(left_child, codebook, row, col + 1);
  165. }
  166. if(right_child != NULL){
  167. numRow = get_num_leaves(right_child);
  168. for(i = *row; i < (*row) + numRow; i++){
  169. codebook[i][col] = 1;
  170. }
  171. gen_codes_rh(right_child, codebook, row, col + 1);
  172. }
  173. }
  174. // print the codes to stdout in a human-readable format
  175. void print_codes(int ** codebook, int numRow){
  176. int row = 0;
  177. for(row = 0; row < numRow; row++){
  178. printf("ascii: %c\t", codebook[row][0]);
  179. int col = 1;
  180. printf("code: ");
  181. while(codebook[row][col] != -1){
  182. printf("%d", codebook[row][col]);
  183. col++;
  184. }
  185. printf("\n");
  186. }
  187. }
  188. void write_bit(FILE *fptr, char bit, unsigned char *bit_pos, unsigned char *cur_byte){
  189. if((*bit_pos) == 0){
  190. *cur_byte = 0;
  191. }
  192. unsigned char shifted_bit = bit << (ASCII_BYTE_SIZE - 1 - (*bit_pos));
  193. *cur_byte |= shifted_bit;
  194. if((*bit_pos) == ASCII_BYTE_SIZE - 1){
  195. fputc(*cur_byte, fptr);
  196. }
  197. *bit_pos = ((*bit_pos) + 1) % ASCII_BYTE_SIZE;
  198. }
  199. void read_bit(FILE *fptr, unsigned char *bit, unsigned char *bit_pos, unsigned char * cur_byte){
  200. if((*bit_pos) == 0){
  201. if(fread(cur_byte, ASCII_BYTE_SIZE, 1, fptr) != 1){
  202. return;
  203. }
  204. }
  205. unsigned char shifted_bit = (*cur_byte) >> (ASCII_BYTE_SIZE - 1 - (*bit_pos));
  206. shifted_bit = shifted_bit & 0x01;
  207. (*bit_pos) = ((*bit_pos) + 1) % ASCII_BYTE_SIZE;
  208. *bit = shifted_bit;
  209. return;
  210. }
  211. void write_file_header(FILE *fptr, int *weights){
  212. int i = 0;
  213. for(i = 0; i < ASCII_SIZE; i++){
  214. if(weights[i] > 0){
  215. fprintf(fptr, "%c%d", i, weights[i]);
  216. }
  217. }
  218. fprintf(fptr, "\n");
  219. }
  220. Huff_Node * read_file_header(FILE *fptr){
  221. Huff_Node *head = NULL;
  222. Huff_Node *node = NULL;
  223. char ascii = 0;
  224. int weight = 0;
  225. int weight_total = 0;
  226. while(fscanf(fptr, "%c%d", &ascii, &weight) > 1){
  227. node = create_huff_node(ascii, weight);
  228. weight_total = weight_total + weight;
  229. add_list_node(&head, node);
  230. }
  231. Huff_Node * root = gen_huff_tree(head, weight_total);
  232. return root;
  233. }
  234. void write_file_header_bits(FILE *fptr, Huff_Node *root, int weight_total, unsigned int num_leaves){
  235. unsigned char bit_pos = 0;
  236. unsigned char cur_byte = 0;
  237. fwrite(&weight_total, sizeof(int), 1, fptr);
  238. write_file_header_bits_rh(fptr, root, &bit_pos, &cur_byte);
  239. write_bit(fptr, 0, &bit_pos, &cur_byte);
  240. // pad zeros
  241. while(bit_pos != 0){
  242. write_bit(fptr, 0, &bit_pos, &cur_byte);
  243. }
  244. fwrite(&num_leaves, sizeof(unsigned int), 1, fptr);
  245. unsigned char line_break = '\n';
  246. fwrite(&line_break, sizeof(unsigned int), 1, fptr);
  247. }
  248. void write_file_header_bits_rh(FILE *fptr, Huff_Node *root, unsigned char *bit_pos, unsigned char *cur_byte){
  249. if(root == NULL){
  250. printf("Error.");
  251. }
  252. Huff_Node *left = root->left;
  253. Huff_Node *right = root->right;
  254. if((left == NULL) && (right == NULL)){
  255. write_bit(fptr, 1, bit_pos, cur_byte);
  256. write_ascii_bits(fptr, root->ascii, bit_pos, cur_byte);
  257. return;
  258. }
  259. write_file_header_bits_rh(fptr, left, bit_pos, cur_byte);
  260. write_file_header_bits_rh(fptr, right, bit_pos, cur_byte);
  261. write_bit(fptr, 0, bit_pos, cur_byte);
  262. }
  263. void write_ascii_bits(FILE *fptr, char ascii, unsigned char *bit_pos, unsigned char *cur_byte){
  264. unsigned char mask = 0x40;
  265. while(mask > 0){
  266. write_bit(fptr, (ascii & mask) == mask, bit_pos, cur_byte);
  267. mask >>= 1;
  268. }
  269. }
  270. void map_codebook(int ** codebook, int size, int * map){
  271. int i = 0;
  272. int j = 0;
  273. for(i = 0; i < ASCII_SIZE; i++){
  274. map[i] = -1;
  275. for(j = 0; j < size; j++){
  276. if(codebook[j][0] == i){
  277. map[i] = j;
  278. }
  279. }
  280. }
  281. }
  282. void write_file_data(FILE *fptr_out, FILE *fptr_in, int ** codebook, int * map){
  283. char ch = 0;
  284. int ascii_i = 0;
  285. int code_i = 0;
  286. while((ch = fgetc(fptr_in)) != EOF){
  287. ascii_i = map[(int)ch];
  288. code_i = 1;
  289. while(codebook[ascii_i][code_i] != -1){
  290. fprintf(fptr_out, "%d", codebook[ascii_i][code_i]);
  291. code_i++;
  292. }
  293. }
  294. }
  295. int main (int argc, char **argv){
  296. if(argc != 2){
  297. printf("Invalid number of arguments.\n");
  298. return EXIT_FAILURE;
  299. }
  300. Huff_Node *head = NULL;
  301. int weights[ASCII_SIZE] = {0};
  302. int weight_total = 0;
  303. int map[ASCII_SIZE] = {-1};
  304. count_weights(argv[1], weights);
  305. weight_total = build_list(&head, weights);
  306. Huff_Node * root = gen_huff_tree(head, weight_total);
  307. int numRow = get_num_leaves(root);
  308. int numCol = (ASCII_BYTE_SIZE + 1);
  309. int ** codebook = malloc(sizeof(int *) * numRow);
  310. int row = 0;
  311. int col = 0;
  312. for(row = 0; row < numRow; row++){
  313. codebook[row] = malloc(sizeof(int) * numCol);
  314. for(col = 0; col < numCol; col++){
  315. codebook[row][col] = NO_BIT;
  316. }
  317. }
  318. gen_codes(root, codebook);
  319. map_codebook(codebook, numRow, map);
  320. char input[256];
  321. strcpy(input, argv[1]);
  322. strcat(argv[1], ".huff");
  323. FILE *fptr_in = fopen(input, "r");
  324. FILE *fptr_out = fopen(argv[1], "w");
  325. write_file_header(fptr_out, weights);
  326. fclose(fptr_out);
  327. FILE *data_out = fopen(argv[1], "a");
  328. write_file_data(data_out, fptr_in, codebook, map);
  329. //printf("%s", input);
  330. fclose(data_out);
  331. fclose(fptr_in);
  332. return EXIT_SUCCESS;
  333. }