From 60d8370497e74cb4420cad46a0d1bb19af2bb741 Mon Sep 17 00:00:00 2001 From: Wolfgang Spraul Date: Mon, 30 Jul 2012 09:11:56 +0200 Subject: [PATCH] added small text utilities hstrrep, sort_seq and merge_seq --- .gitignore | 4 + Makefile | 6 +- README | 2 + hstrrep.c | 114 ++++++++++++++++++ merge_seq.c | 324 ++++++++++++++++++++++++++++++++++++++++++++++++++++ sort_seq.c | 147 ++++++++++++++++++++++++ 6 files changed, 596 insertions(+), 1 deletion(-) create mode 100644 hstrrep.c create mode 100644 merge_seq.c create mode 100644 sort_seq.c diff --git a/.gitignore b/.gitignore index ef3df0c..e674312 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,7 @@ new_fp new_fp.o hstrrep hstrrep.o +sort_seq +sort_seq.o +merge_seq +merge_seq.o diff --git a/Makefile b/Makefile index 9ed94ac..5867e4c 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ CFLAGS = -Wall -g LDLIBS = -lxml2 -all: bit2txt draw_svg_tiles new_fp hstrrep xc6slx9.svg xc6slx9.fp +all: bit2txt draw_svg_tiles new_fp hstrrep sort_seq merge_seq xc6slx9.svg xc6slx9.fp xc6slx9.svg: draw_svg_tiles ./draw_svg_tiles | xmllint --pretty 1 - > $@ @@ -36,6 +36,10 @@ new_fp: new_fp.o model.o helper.o hstrrep: hstrrep.o helper.o +sort_seq: sort_seq.o + +merge_seq: merge_seq.o + clean: rm -f bit2txt bit2txt.o \ draw_svg_tiles draw_svg_tiles.o \ diff --git a/README b/README index 0c6308b..f360c9b 100644 --- a/README +++ b/README @@ -14,3 +14,5 @@ Utilities - fp2bit converts .fp floorplan into .bit bitstream - bit2txt dumps .bit bitstream as text - hstrrep high-speed hashed array based search and replace util +- sort_seq sorts line-based text file by sequence numbers in strings +- merge_seq merges a pre-sorted text file into wire sequences diff --git a/hstrrep.c b/hstrrep.c new file mode 100644 index 0000000..94ad473 --- /dev/null +++ b/hstrrep.c @@ -0,0 +1,114 @@ +// +// Author: Wolfgang Spraul +// +// This is free and unencumbered software released into the public domain. 
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h> // NOTE(review): the header names were missing from
+#include <stdarg.h> // this patch text; these three are guesses for
+#include <unistd.h> // what helper.h needs - confirm against the repo
+
+#include "helper.h"
+
+int main(int argc, char** argv) // argv[1]: data file, argv[2]: token file
+{
+	char line[1024], search_str[1024], replace_str[1024];
+	char* next_word, *lasts;
+	const char* replace_ptr;
+	struct hashed_strarray search_arr, replace_arr;
+	FILE* fp = 0;
+	int i, rc, search_idx;
+
+	if (argc < 3) {
+		fprintf(stderr,
+			"\n"
+			"hstrrep - hashed string replace\n"
+			"Usage: %s <data_file> <token_file>\n"
+			" token_file has 2 words per line: First word is string\n"
+			" that is to be replaced, second word is the replacement.\n", argv[0]);
+		goto xout;
+	}
+
+	if (strarray_init(&search_arr, STRIDX_1M)) {
+		fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+		goto xout;
+	}
+	if (strarray_init(&replace_arr, STRIDX_1M)) {
+		fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+		goto xout;
+	}
+
+	// Pass 1: read the token file. Each line holds a search word and
+	// its replacement; the replacement is stashed in replace_arr under
+	// the index that strarray_add() reported for the search word.
+
+	fp = fopen(argv[2], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[2]);
+		goto xout;
+	}
+	while (fgets(line, sizeof(line), fp)) {
+		memset(replace_str, 0, sizeof(replace_str));
+		i = sscanf(line, " %[^ ] %1023c", search_str, replace_str);
+		if (i == 2) {
+			i = strlen(replace_str);
+			if (i && replace_str[i-1] == '\n')
+				replace_str[i-1] = 0;
+			rc = strarray_add(&search_arr, search_str, &search_idx);
+			if (rc) {
+				fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+				goto xout;
+			}
+			rc = strarray_stash(&replace_arr, replace_str, search_idx);
+			if (rc) {
+				fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+				goto xout;
+			}
+		}
+	}
+	fclose(fp);
+
+	// Pass 2: copy the data file to stdout word by word, substituting
+	// every word found in search_arr. Words are re-joined with single
+	// spaces, so the original spacing of a line is not preserved.
+
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+	while (fgets(line, sizeof(line), fp)) {
+		next_word = strtok_r(line, " \n", &lasts);
+		if (next_word) {
+			do {
+				rc = strarray_find(&search_arr, next_word, &search_idx);
+				if (rc) {
+					fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+					goto xout;
+				}
+				if (search_idx == STRIDX_NO_ENTRY)
+					fputs(next_word, stdout);
+				else {
+					replace_ptr = strarray_lookup(&replace_arr, search_idx);
+					if (!replace_ptr) {
+						fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+						goto xout;
+					}
+					fputs(replace_ptr, stdout);
+				}
+				next_word = strtok_r(0, " \n", &lasts);
+				if (next_word)
+					putchar(' ');
+			} while ( next_word );
+			putchar('\n');
+		}
+	}
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	return EXIT_FAILURE;
+}
diff --git a/merge_seq.c b/merge_seq.c
new file mode 100644
index 0000000..6046b47
--- /dev/null
+++ b/merge_seq.c
@@ -0,0 +1,354 @@
+//
+// Author: Wolfgang Spraul
+//
+// This is free and unencumbered software released into the public domain.
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINE_LENGTH 1024
+
+// One input line plus the state of the number sequence that has been
+// merged into it so far.
+struct line_buf
+{
+	// buf[0] == 0 signals 'no line'
+	char buf[LINE_LENGTH];
+	// left_digit_start_o and right_digit_start_o will be -1
+	// if left/right is not initialized. The *_o members are offsets
+	// into buf, *_base is the first number of the sequence.
+	int left_digit_start_o, left_digit_end_o, left_digit_base;
+	int right_digit_start_o, right_digit_end_o, right_digit_base;
+	// sequence_size == 0 means no sequence detected, 1 means
+	// two members in sequence (e.g. 0:1), etc.
+	int sequence_size;
+};
+
+// Writes one line to stdout. If the line carries a sequence, each
+// sequence number is printed as 'base:base+sequence_size'. A line
+// with buf[0] == 0 prints nothing. Always returns 0.
+static int print_line(const struct line_buf* line)
+{
+	char buf[LINE_LENGTH];
+
+	if (!line->buf[0]) return 0;
+	if (!line->sequence_size || line->left_digit_start_o < 0) {
+		// the text comes from the input file, so it must never
+		// be used as a printf format string
+		fputs(line->buf, stdout);
+		return 0;
+	}
+	if (line->right_digit_start_o < 0)
+		snprintf(buf, sizeof(buf), "%.*s%i:%i%s",
+			line->left_digit_start_o,
+			line->buf,
+			line->left_digit_base,
+			line->left_digit_base+line->sequence_size,
+			&line->buf[line->left_digit_end_o]);
+	else
+		snprintf(buf, sizeof(buf), "%.*s%i:%i%.*s%i:%i%s",
+			line->left_digit_start_o,
+			line->buf,
+			line->left_digit_base,
+			line->left_digit_base+line->sequence_size,
+			line->right_digit_start_o-line->left_digit_end_o,
+			&line->buf[line->left_digit_end_o],
+			line->right_digit_base,
+			line->right_digit_base+line->sequence_size,
+			&line->buf[line->right_digit_end_o]);
+	fputs(buf, stdout);
+	return 0;
+}
+
+// Finds the next word in s at or after offset start. Words are
+// separated by space, tab or newline. *beg receives the offset of the
+// word's first character and *end the offset just past its last one;
+// *beg == *end means there are no more words.
+static void next_word(const char*s, int start, int* beg, int* end)
+{
+	int i = start;
+	while (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') i++;
+	*beg = i;
+	while (s[i] != ' ' && s[i] != '\t' && s[i] != '\n' && s[i]) i++;
+	*end = i;
+}
+
+// Converts the len characters at s to an integer, treating each one
+// as a decimal digit. Returns 0 for len == 0. No overflow check.
+static int to_i(const char* s, int len)
+{
+	int num, base;
+	for (base = 1, num = 0; len; num += base*(s[--len]-'0'), base *= 10);
+	return num;
+}
+
+// Finds the position of a number that differs between the words a and
+// b. The number must meet two criteria:
+// - prefixed by at least one capital 'A'-'Z' or '_'
+// - suffixed by matching or empty strings
+// On success *ab_start is the offset where the number starts in both
+// words and *a_end/*b_end are the offsets just past it in a and b.
+// On failure *ab_start is -1 and *a_end/*b_end are left untouched.
+static void find_non_matching_number(const char* a, int a_len,
+	const char* b, int b_len, int* ab_start, int* a_end, int* b_end)
+{
+	int a_o, b_o, digit_start, a_num, b_num;
+
+	*ab_start = -1;
+	a_o = 0;
+
+	// from the left side, search for the first non-matching
+	// character (check the bounds before reading the characters)
+	while (a_o < a_len && a_o < b_len && a[a_o] == b[a_o])
+		a_o++;
+
+	// if the strings match entirely, return
+	if (a_o >= a_len && a_o >= b_len) return;
+
+	// If neither of the non-matching characters is a digit, return
+	if ((a[a_o] < '0' || a[a_o] > '9')
+	    && (b[a_o] < '0' || b[a_o] > '9'))
+		return;
+
+	// go back to beginning of numeric section
+	// (first and second must be identical going backwards)
+	while (a_o && a[a_o-1] >= '0' && a[a_o-1] <= '9')
+		a_o--;
+
+	// If there is not at least one capital 'A'-'Z' or '_'
+	// before the number, return
+	if (!a_o
+	    || ((a[a_o-1] < 'A' || a[a_o-1] > 'Z')
+	        && a[a_o-1] != '_')) return;
+
+	// now skip over all digits in left and right string
+	digit_start = a_o;
+	while (a_o < a_len && a[a_o] >= '0' && a[a_o] <= '9')
+		a_o++;
+	b_o = digit_start;
+	while (b_o < b_len && b[b_o] >= '0' && b[b_o] <= '9')
+		b_o++;
+
+	// there must be at least one digit on each side
+	if (a_o <= digit_start || b_o <= digit_start) return;
+
+	a_num = to_i(&a[digit_start], a_o-digit_start);
+	b_num = to_i(&b[digit_start], b_o-digit_start);
+	if (a_num == b_num) {
+		fprintf(stderr, "Strange parsing issue with '%.*s' and '%.*s'\n", a_len, a, b_len, b);
+		return;
+	}
+
+	// the trailing part after the two numbers must match
+	if (a_len - a_o != b_len - b_o) return;
+	if ((a_len - a_o) && strncmp(&a[a_o], &b[b_o], a_len-a_o)) return;
+
+	*ab_start = digit_start;
+	*a_end = a_o;
+	*b_end = b_o;
+}
+
+// Tries to merge second_l into first_l. Two lines can be merged when
+// they are equal word for word except for one or two words, each of
+// which differs only in an embedded number that continues first_l's
+// sequence by one. On a merge first_l's sequence state is updated and
+// second_l->buf[0] is set to 0; otherwise both lines are unchanged.
+// Returns 0 in both cases and -1 on an internal error.
+static int merge_line(struct line_buf* first_l, struct line_buf* second_l)
+{
+	// first_num starts at 0 because it is copied to left_num below
+	// even on the path that never assigns it
+	int first_o, second_o, fs_start, f_end, s_end, first_num = 0, second_num;
+	int first_eow, second_eow, f_start, s_start;
+	int left_start, left_end, left_num;
+
+	if (!first_l->buf[0] || !second_l->buf[0]) return 0;
+	// go through word by word, find first non-equal word
+	first_o = 0;
+	second_o = 0;
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+	// non-matching number inside?
+	fs_start = -1;
+	find_non_matching_number(&first_l->buf[first_o], first_eow-first_o,
+		&second_l->buf[second_o], second_eow-second_o,
+		&fs_start, &f_end, &s_end);
+	if (fs_start == -1) return 0; // no: cannot merge
+	f_start = first_o+fs_start;
+	f_end += first_o;
+	s_start = second_o+fs_start;
+	s_end += second_o;
+	first_o = first_eow;
+	second_o = second_eow;
+
+	// in sequence? if not, cannot merge
+	second_num = to_i(&second_l->buf[s_start], s_end-s_start);
+	if (first_l->sequence_size) {
+		if (first_l->left_digit_start_o < 0) {
+			fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+			return -1;
+		}
+		if (second_num != first_l->left_digit_base + first_l->sequence_size + 1)
+			return 0;
+	} else {
+		first_num = to_i(&first_l->buf[f_start], f_end-f_start);
+		if (second_num != first_num + 1)
+			return 0;
+	}
+
+	// find next non-equal word
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o && second_eow <= second_o) {
+			// reached end of line
+			if (first_l->sequence_size) {
+				if (first_l->right_digit_start_o != -1) return 0;
+				first_l->sequence_size++;
+			} else {
+				first_l->left_digit_start_o = f_start;
+				first_l->left_digit_end_o = f_end;
+				first_l->left_digit_base = first_num;
+				first_l->right_digit_start_o = -1;
+				first_l->sequence_size = 1;
+			}
+			second_l->buf[0] = 0;
+			return 0;
+		}
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+
+	// now we must find a second number matching the sequence
+	left_start = f_start;
+	left_end = f_end;
+	left_num = first_num;
+
+	// non-matching number inside?
+	fs_start = -1;
+	find_non_matching_number(&first_l->buf[first_o], first_eow-first_o,
+		&second_l->buf[second_o], second_eow-second_o,
+		&fs_start, &f_end, &s_end);
+	if (fs_start == -1) return 0; // no: cannot merge
+	f_start = first_o+fs_start;
+	f_end += first_o;
+	s_start = second_o+fs_start;
+	s_end += second_o;
+	first_o = first_eow;
+	second_o = second_eow;
+
+	// in sequence? if not, cannot merge
+	second_num = to_i(&second_l->buf[s_start], s_end-s_start);
+	if (first_l->sequence_size) {
+		if (first_l->right_digit_start_o < 0
+		    || second_num != first_l->right_digit_base + first_l->sequence_size + 1)
+			return 0;
+	} else {
+		first_num = to_i(&first_l->buf[f_start], f_end-f_start);
+		if (second_num != first_num + 1)
+			return 0;
+	}
+
+	// find next non-equal word
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o && second_eow <= second_o) {
+			// reached end of line
+			if (first_l->sequence_size)
+				first_l->sequence_size++;
+			else {
+				first_l->left_digit_start_o = left_start;
+				first_l->left_digit_end_o = left_end;
+				first_l->left_digit_base = left_num;
+				first_l->right_digit_start_o = f_start;
+				first_l->right_digit_end_o = f_end;
+				first_l->right_digit_base = first_num;
+				first_l->sequence_size = 1;
+			}
+			second_l->buf[0] = 0;
+			return 0;
+		}
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+	// found another non-matching word, cannot merge
+	return 0;
+}
+
+// Reads the pre-sorted file argv[1] line by line, merges consecutive
+// lines that form a number sequence and prints the result to stdout.
+int main(int argc, char** argv)
+{
+	struct line_buf first_line, second_line;
+	FILE* fp = 0;
+	// rc is returned from xout, which is also reached before any
+	// function result has been stored in it
+	int rc = EXIT_FAILURE;
+
+	if (argc < 2) {
+		fprintf(stderr,
+			"merge_seq - merge sequence (needs presorted file)\n"
+			"Usage: %s <data_file>\n", argv[0]);
+		goto xout;
+	}
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+
+	// read first line
+	first_line.buf[0] = 0;
+	first_line.left_digit_start_o = -1;
+	first_line.right_digit_start_o = -1;
+	first_line.sequence_size = 0;
+	if (!fgets(first_line.buf, sizeof(first_line.buf), fp)
+	    || !first_line.buf[0]) goto out;
+
+	while (1) {
+		// read second line
+		second_line.buf[0] = 0;
+		second_line.left_digit_start_o = -1;
+		second_line.right_digit_start_o = -1;
+		second_line.sequence_size = 0;
+		if (!fgets(second_line.buf, sizeof(second_line.buf), fp))
+			break;
+		// can the two be merged?
+		rc = merge_line(&first_line, &second_line);
+		if (rc) goto xout;
+		if (second_line.buf[0]) {
+			// no: print first line and move second line to first
+			rc = print_line(&first_line);
+			if (rc) goto xout;
+			first_line = second_line;
+		}
+	}
+	rc = print_line(&first_line);
+	if (rc) goto xout;
+out:
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	if (fp) fclose(fp);
+	return rc;
+}
diff --git a/sort_seq.c b/sort_seq.c
new file mode 100644
index 0000000..d0c24cd
--- /dev/null
+++ b/sort_seq.c
@@ -0,0 +1,167 @@
+//
+// Author: Wolfgang Spraul
+//
+// This is free and unencumbered software released into the public domain.
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINE_LENGTH 1024
+
+// The sort window: s_numlines is the number of lines held in s_lines.
+static int s_numlines;
+static char s_lines[1000][LINE_LENGTH];
+
+// Returns 1 if str is exactly one of the known suffixes listed below,
+// 0 otherwise.
+static int is_known_suffix(const char* str)
+{
+	static const char known_suffix[32][16] =
+		{ "_S0", "_N3", "_INT0", "_INT1", "_INT2", "_INT3",
+		  "_TEST", "_BRK", "_BUF", "_FOLD", "_BRAM", "_BRAM_INTER",
+		  "_CLB", "_DSP", "_INT", "_MCB", "_DOWN", "_UP",
+		  "_E", "_W", "_S", "_N", "_M", "_EXT", "_PINW",
+		  "" };
+	int i;
+
+	if (str[0] != '_') return 0;
+	for (i = 0; known_suffix[i][0]; i++) {
+		if (!strcmp(known_suffix[i], str))
+			return 1;
+	}
+	return 0;
+}
+
+// Copies the word that starts at s into buf and NUL-terminates it.
+// The word ends at the first space, tab, newline or NUL. buf must be
+// large enough to hold the word; no length check is done.
+static void copy_word(char* buf, const char* s)
+{
+	int i = 0;
+	while (s[i] != ' ' && s[i] != '\t' && s[i] != '\n' && s[i]) {
+		buf[i] = s[i];
+		i++;
+	}
+	buf[i] = 0;
+}
+
+// qsort() comparator for two lines of s_lines. The lines are compared
+// up to their first difference. If a number is involved there, the
+// numbers are compared by value, except that a differing known suffix
+// right after the numbers decides the order first.
+int sort_lines(const void* a, const void* b)
+{
+	const char* _a, *_b;
+	int i, a_i, b_i, rc;
+	// long matches strtol() and keeps the comparison free of the
+	// overflow that 'a_num - b_num' on int could run into
+	long a_num, b_num;
+	char a_word[1024], b_word[1024];
+
+	_a = a;
+	_b = b;
+
+	// search first non-matching character
+	for (i = 0; _a[i] && _a[i] == _b[i]; i++);
+
+	// if entire string matches, return 0
+	if (!_a[i] && !_b[i]) return 0;
+
+	// if neither of the non-matching characters is a digit, return
+	if ((_a[i] < '0' || _a[i] > '9')
+	    && (_b[i] < '0' || _b[i] > '9'))
+		return _a[i] - _b[i];
+
+	// go back to beginning of numeric section
+	// (a and b must be identical going backwards)
+	while (i && _a[i-1] >= '0' && _a[i-1] <= '9')
+		i--;
+
+	// go forward to first non-digit
+	for (a_i = i; _a[a_i] >= '0' && _a[a_i] <= '9'; a_i++ );
+	for (b_i = i; _b[b_i] >= '0' && _b[b_i] <= '9'; b_i++ );
+
+	// there must be at least one digit on each side
+	if (a_i <= i || b_i <= i) {
+		// Only one side has a digit here: that side sorts after
+		// the other one.
+		// NOTE(review): this comment used to say that numbers are
+		// moved *before* all other characters - confirm the intent.
+		if (_a[i] >= '0' && _a[i] <= '9'
+		    && (_b[i] < '0' || _b[i] > '9')) return 1;
+		if (_b[i] >= '0' && _b[i] <= '9'
+		    && (_a[i] < '0' || _a[i] > '9')) return -1;
+		return _a[i] - _b[i];
+	}
+
+	// for known suffixes, the suffix comes before the number
+	copy_word(a_word, &_a[a_i]);
+	copy_word(b_word, &_b[b_i]);
+	if ((!a_word[0] || is_known_suffix(a_word))
+	    && (!b_word[0] || is_known_suffix(b_word))) {
+		rc = strcmp(a_word, b_word);
+		if (rc) return rc;
+	}
+
+	a_num = strtol(&_a[i], 0 /* endptr */, 10);
+	b_num = strtol(&_b[i], 0 /* endptr */, 10);
+	if (a_num != b_num)
+		return a_num < b_num ? -1 : 1;
+
+	return strcmp(&_a[a_i], &_b[b_i]);
+}
+
+// Sorts the lines of the file argv[1] to stdout through a window of
+// 1000 lines: every round sorts the window, prints its first 800
+// lines and carries the rest over into the next round.
+int main(int argc, char** argv)
+{
+	FILE* fp = 0;
+	int i;
+
+	if (argc < 2) {
+		fprintf(stderr,
+			"sort_seq - sort by sequence\n"
+			"Usage: %s <data_file>\n", argv[0]);
+		goto xout;
+	}
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+	s_numlines = 0;
+	// read 200 lines to beginning of buffer
+	while (s_numlines < 200
+	    && fgets(s_lines[s_numlines], sizeof(s_lines[0]), fp))
+		s_numlines++;
+	while (1) {
+		// read another 800 lines
+		while (s_numlines < 1000
+		    && fgets(s_lines[s_numlines], sizeof(s_lines[0]), fp))
+			s_numlines++;
+		if (!s_numlines) break;
+		// sort 1000 lines
+		qsort(s_lines, s_numlines, sizeof(s_lines[0]), sort_lines);
+		// print first 800 lines
+		for (i = 0; i < 800; i++) {
+			if (i >= s_numlines) break;
+			// lines are data, not printf format strings
+			fputs(s_lines[i], stdout);
+		}
+		// move up last 200 lines to beginning of buffer
+		if (s_numlines > i) {
+			memmove(s_lines[0], s_lines[i],
+				(s_numlines-i)*sizeof(s_lines[0]));
+			s_numlines -= i;
+		} else
+			s_numlines = 0;
+	}
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	return EXIT_FAILURE;
+}