From 60d8370497e74cb4420cad46a0d1bb19af2bb741 Mon Sep 17 00:00:00 2001
From: Wolfgang Spraul <wspraul@q-ag.de>
Date: Mon, 30 Jul 2012 09:11:56 +0200
Subject: [PATCH] added small text utilities hstrrep, sort_seq and merge_seq

---
 .gitignore  |   4 +
 Makefile    |   6 +-
 README      |   2 +
 hstrrep.c   | 114 ++++++++++++++++++
 merge_seq.c | 324 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 sort_seq.c  | 147 ++++++++++++++++++++++++
 6 files changed, 596 insertions(+), 1 deletion(-)
 create mode 100644 hstrrep.c
 create mode 100644 merge_seq.c
 create mode 100644 sort_seq.c

diff --git a/.gitignore b/.gitignore
index ef3df0c..e674312 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,7 @@ new_fp
 new_fp.o
 hstrrep
 hstrrep.o
+sort_seq
+sort_seq.o
+merge_seq
+merge_seq.o
diff --git a/Makefile b/Makefile
index 9ed94ac..5867e4c 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@
 CFLAGS = -Wall -g
 LDLIBS = -lxml2
 
-all: bit2txt draw_svg_tiles new_fp hstrrep xc6slx9.svg xc6slx9.fp
+all: bit2txt draw_svg_tiles new_fp hstrrep sort_seq merge_seq xc6slx9.svg xc6slx9.fp
 
 xc6slx9.svg: draw_svg_tiles
 	./draw_svg_tiles | xmllint --pretty 1 - > $@
@@ -36,6 +36,10 @@ new_fp: new_fp.o model.o helper.o
 
 hstrrep: hstrrep.o helper.o
 
+sort_seq: sort_seq.o
+
+merge_seq: merge_seq.o
+
 clean:
 		rm -f bit2txt bit2txt.o \
 			draw_svg_tiles draw_svg_tiles.o \
diff --git a/README b/README
index 0c6308b..f360c9b 100644
--- a/README
+++ b/README
@@ -14,3 +14,5 @@ Utilities
 - fp2bit         converts .fp floorplan into .bit bitstream
 - bit2txt        dumps .bit bitstream as text
 - hstrrep        high-speed hashed array based search and replace util
+- sort_seq       sorts line-based text file by sequence numbers in strings
+- merge_seq      merges a pre-sorted text file into wire sequences
diff --git a/hstrrep.c b/hstrrep.c
new file mode 100644
index 0000000..94ad473
--- /dev/null
+++ b/hstrrep.c
@@ -0,0 +1,114 @@
+//
+// Author: Wolfgang Spraul
+//
+// This is free and unencumbered software released into the public domain.
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <sys/stat.h>
+
+#include "helper.h"
+
+// hstrrep: loads search/replace pairs from <token_file> (argv[2]), then copies
+// <data_file> (argv[1]) to stdout word by word, substituting known words.
+// Returns EXIT_SUCCESS, or EXIT_FAILURE after reporting the error on stderr.
+int main(int argc, char** argv)
+{
+	char line[1024], search_str[1024], replace_str[1024];
+	char* next_word, *lasts;
+	const char* replace_ptr;
+	struct hashed_strarray search_arr, replace_arr;
+	FILE* fp = 0;
+	int i, rc, search_idx;
+
+	if (argc < 3) {
+		fprintf(stderr,
+			"\n"
+			"hstrrep - hashed string replace\n"
+			"Usage: %s <data_file> <token_file>\n"
+			"  token_file has 2 words per line: First word is string\n"
+			"  that is to be replaced, second word is the replacement.\n", argv[0]);
+		goto xout;
+	}
+	if (strarray_init(&search_arr, STRIDX_1M)) {
+		fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+		goto xout;
+	}
+	if (strarray_init(&replace_arr, STRIDX_1M)) {
+		fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+		goto xout;
+	}
+
+	// Pass 1: read the token file. Each usable line holds the search word,
+	// a blank, and the replacement text (which may itself contain blanks).
+	// The replacement is stashed under the index assigned to its search word.
+	fp = fopen(argv[2], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[2]);
+		goto xout;
+	}
+	while (fgets(line, sizeof(line), fp)) {
+		memset(replace_str, 0, sizeof(replace_str)); // %c adds no terminating NUL
+		i = sscanf(line, " %[^ ] %1023c", search_str, replace_str);
+		if (i == 2) {
+			i = strlen(replace_str);
+			if (i && replace_str[i-1] == '\n')
+				replace_str[i-1] = 0;
+			rc = strarray_add(&search_arr, search_str, &search_idx);
+			if (rc) {
+				fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+				goto xout;
+			}
+			rc = strarray_stash(&replace_arr, replace_str, search_idx);
+			if (rc) {
+				fprintf(stderr, "Out of memory in %s:%i\n", __FILE__, __LINE__);
+				goto xout;
+			}
+		}
+	}
+	fclose(fp);
+
+	// Pass 2: stream the data file. Words are separated by blanks; each
+	// word is looked up and either replaced or echoed unchanged.
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+	while (fgets(line, sizeof(line), fp)) {
+		next_word = strtok_r(line, " \n", &lasts);
+		if (next_word) {
+			do {
+				rc = strarray_find(&search_arr, next_word, &search_idx);
+				if (rc) {
+					fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+					goto xout;
+				}
+				if (search_idx == STRIDX_NO_ENTRY)
+					fputs(next_word, stdout);
+				else {
+					replace_ptr = strarray_lookup(&replace_arr, search_idx);
+					if (!replace_ptr) {
+						fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+						goto xout;
+					}
+					fputs(replace_ptr, stdout);
+				}
+				next_word = strtok_r(0, " \n", &lasts);
+				if (next_word)
+					putchar(' ');
+			} while ( next_word );
+			putchar('\n');
+		}
+	}
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	if (fp) fclose(fp); // do not leak the stream that was open when the error hit
+	return EXIT_FAILURE;
+}
diff --git a/merge_seq.c b/merge_seq.c
new file mode 100644
index 0000000..6046b47
--- /dev/null
+++ b/merge_seq.c
@@ -0,0 +1,324 @@
+//
+// Author: Wolfgang Spraul
+//
+// This is free and unencumbered software released into the public domain.
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define LINE_LENGTH	1024	// size of one line buffer, terminating NUL included
+
+struct line_buf
+{
+	// Raw text of the line as read by fgets(); buf[0] == 0 signals 'no line'.
+	char buf[LINE_LENGTH];
+	// Offsets into buf of the first digit (start_o) and one past the last digit
+	// (end_o) of a merged number, plus its first value (base); start_o is -1 if unset.
+	int left_digit_start_o, left_digit_end_o, left_digit_base;
+	int right_digit_start_o, right_digit_end_o, right_digit_base;
+	// Number of lines merged into this one: 0 means no sequence detected,
+	// 1 means two members in sequence (printed as base:base+1), etc.
+	int sequence_size;
+};
+
+// Writes one line to stdout. A line that carries no sequence is written as
+// is; otherwise every merged number is replaced by "<base>:<base+sequence_size>".
+// The line text is only ever passed as data, never as the format string, so
+// '%' characters in the input are printed literally. The output is produced
+// directly rather than through a LINE_LENGTH scratch buffer, so a line that
+// grows when the ranges are inserted is not cut short. Always returns 0.
+static int print_line(const struct line_buf* line)
+{
+	if (!line->buf[0]) return 0; // 'no line': nothing to print
+	if (!line->sequence_size || line->left_digit_start_o < 0) {
+		fputs(line->buf, stdout);
+		return 0;
+	}
+	if (line->right_digit_start_o < 0)
+		// one merged number: prefix, range, rest of the line
+		printf("%.*s%i:%i%s",
+			line->left_digit_start_o, line->buf,
+			line->left_digit_base, line->left_digit_base+line->sequence_size,
+			&line->buf[line->left_digit_end_o]);
+	else
+		// two merged numbers: prefix, range, text in between, range, rest
+		printf("%.*s%i:%i%.*s%i:%i%s",
+			line->left_digit_start_o, line->buf,
+			line->left_digit_base, line->left_digit_base+line->sequence_size,
+			line->right_digit_start_o-line->left_digit_end_o,
+			&line->buf[line->left_digit_end_o],
+			line->right_digit_base, line->right_digit_base+line->sequence_size,
+			&line->buf[line->right_digit_end_o]);
+	return 0;
+}
+
+// Locates the next whitespace-delimited word of s at or after offset start:
+// *beg receives the offset of its first character, *end the offset one past
+// its last. *beg == *end means no word is left before the terminating NUL.
+static void next_word(const char*s, int start, int* beg, int* end)
+{
+	*beg = start + (int) strspn(&s[start], " \t\n");
+	*end = *beg + (int) strcspn(&s[*beg], " \t\n");
+}
+
+static int to_i(const char* s, int len) // decimal value of the len digits at s
+{
+	int value = 0, pos;
+	for (pos = 0; pos < len; pos++) value = value*10 + (s[pos]-'0');
+	return value;
+}
+
+// Compares two words a and b (given by pointer and length) and looks for a
+// single decimal number in which they differ. On success *ab_start is the
+// offset of the number's first digit (identical in both words) and *a_end /
+// *b_end are the offsets one past its last digit in a and b; otherwise
+// *ab_start is -1 and the end offsets are left untouched. To qualify,
+// - the number must be preceded by a capital 'A'-'Z' or '_', and
+// - the text following the number must be identical in both words.
+static void find_non_matching_number(const char* a, int a_len,
+	const char* b, int b_len, int* ab_start, int* a_end, int* b_end)
+{
+	int a_o, b_o, digit_start, a_num, b_num, a_digit, b_digit;
+
+	*ab_start = -1;
+	// from the left side, search for the first non-matching character;
+	// the lengths are tested first so that nothing past a word is read
+	for (a_o = 0; a_o < a_len && a_o < b_len && a[a_o] == b[a_o]; a_o++);
+
+	// if the strings match entirely, return
+	if (a_o >= a_len && a_o >= b_len) return;
+
+	// If neither of the non-matching characters is a digit, return
+	// (a word that has already ended counts as a non-digit)
+	a_digit = a_o < a_len && a[a_o] >= '0' && a[a_o] <= '9';
+	b_digit = a_o < b_len && b[a_o] >= '0' && b[a_o] <= '9';
+	if (!a_digit && !b_digit) return;
+
+	// go back to beginning of numeric section
+	// (first and second must be identical going backwards)
+	while (a_o && a[a_o-1] >= '0' && a[a_o-1] <= '9')
+		a_o--;
+
+	// If there is not at least one capital 'A'-'Z' or '_'
+	// before the number, return
+	if (!a_o
+	    || ((a[a_o-1] < 'A' || a[a_o-1] > 'Z')
+		&& a[a_o-1] != '_')) return;
+
+	// now skip over all digits in left and right string
+	digit_start = a_o;
+	while (a_o < a_len && a[a_o] >= '0' && a[a_o] <= '9')
+		a_o++;
+	b_o = digit_start;
+	while (b_o < b_len && b[b_o] >= '0' && b[b_o] <= '9')
+		b_o++;
+
+	// there must be at least one digit on each side
+	if (a_o <= digit_start || b_o <= digit_start) return;
+	a_num = to_i(&a[digit_start], a_o-digit_start);
+	b_num = to_i(&b[digit_start], b_o-digit_start);
+	if (a_num == b_num) {
+		fprintf(stderr, "Strange parsing issue with '%.*s' and '%.*s'\n", a_len, a, b_len, b);
+		return;
+	}
+
+	// the trailing part after the two numbers must match
+	if (a_len - a_o != b_len - b_o) return;
+	if ((a_len - a_o) && strncmp(&a[a_o], &b[b_o], a_len-a_o)) return;
+
+	*ab_start = digit_start;
+	*a_end = a_o;
+	*b_end = b_o;
+}
+
+// Tries to fold second_l into first_l. Two lines merge when they consist of the
+// same words except for one or two numbers (see find_non_matching_number) that
+// each continue the sequence held in first_l by exactly one. On a merge first_l's
+// sequence fields are updated and second_l->buf[0] is set to 0; otherwise both
+// lines are left alone. Returns 0 in either case, -1 on an internal error.
+static int merge_line(struct line_buf* first_l, struct line_buf* second_l)
+{
+	int first_o, second_o, fs_start, f_end, s_end, first_num = 0, second_num;
+	int first_eow, second_eow, f_start, s_start;
+	int left_start, left_end, left_num;
+
+	if (!first_l->buf[0] || !second_l->buf[0]) return 0;
+	// go through word by word, find first non-equal word
+	first_o = 0;
+	second_o = 0;
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+	// non-matching number inside? (the callee sets fs_start to -1 if not)
+	find_non_matching_number(&first_l->buf[first_o], first_eow-first_o,
+		&second_l->buf[second_o], second_eow-second_o,
+		&fs_start, &f_end, &s_end);
+	if (fs_start == -1) return 0; // no: cannot merge
+	f_start = first_o+fs_start;
+	f_end += first_o;
+	s_start = second_o+fs_start;
+	s_end += second_o;
+	first_o = first_eow;
+	second_o = second_eow;
+	// in sequence? if not, cannot merge
+	second_num = to_i(&second_l->buf[s_start], s_end-s_start);
+	if (first_l->sequence_size) {
+		if (first_l->left_digit_start_o < 0) {
+			fprintf(stderr, "Internal error in %s:%i\n", __FILE__, __LINE__);
+			return -1;
+		}
+		if (f_start != first_l->left_digit_start_o // must be the merged number
+		    || second_num != first_l->left_digit_base + first_l->sequence_size + 1)
+			return 0;
+	} else {
+		first_num = to_i(&first_l->buf[f_start], f_end-f_start);
+		if (second_num != first_num + 1)
+			return 0;
+	}
+
+	// find next non-equal word
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o && second_eow <= second_o) {
+			// reached end of line
+			if (first_l->sequence_size) {
+				if (first_l->right_digit_start_o != -1) return 0;
+				first_l->sequence_size++;
+			} else {
+				first_l->left_digit_start_o = f_start;
+				first_l->left_digit_end_o = f_end;
+				first_l->left_digit_base = first_num;
+				first_l->right_digit_start_o = -1;
+				first_l->sequence_size = 1;
+			}
+			second_l->buf[0] = 0;
+			return 0;
+		}
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+	// now we must find a second number matching the sequence
+	left_start = f_start;
+	left_end = f_end;
+	left_num = first_num; // only used below when no sequence exists yet
+	// non-matching number inside? (the callee sets fs_start to -1 if not)
+	find_non_matching_number(&first_l->buf[first_o], first_eow-first_o,
+		&second_l->buf[second_o], second_eow-second_o,
+		&fs_start, &f_end, &s_end);
+	if (fs_start == -1) return 0; // no: cannot merge
+	f_start = first_o+fs_start;
+	f_end += first_o;
+	s_start = second_o+fs_start;
+	s_end += second_o;
+	first_o = first_eow;
+	second_o = second_eow;
+	// in sequence? if not, cannot merge
+	second_num = to_i(&second_l->buf[s_start], s_end-s_start);
+	if (first_l->sequence_size) {
+		if (f_start != first_l->right_digit_start_o // also fails if right is unset (-1)
+		    || second_num != first_l->right_digit_base + first_l->sequence_size + 1)
+			return 0;
+	} else {
+		first_num = to_i(&first_l->buf[f_start], f_end-f_start);
+		if (second_num != first_num + 1)
+			return 0;
+	}
+
+	// find next non-equal word
+	while (1) {
+		next_word(first_l->buf, first_o, &first_o, &first_eow);
+		next_word(second_l->buf, second_o, &second_o, &second_eow);
+		if (first_eow <= first_o && second_eow <= second_o) {
+			// reached end of line
+			if (first_l->sequence_size)
+				first_l->sequence_size++;
+			else {
+				first_l->left_digit_start_o = left_start;
+				first_l->left_digit_end_o = left_end;
+				first_l->left_digit_base = left_num;
+				first_l->right_digit_start_o = f_start;
+				first_l->right_digit_end_o = f_end;
+				first_l->right_digit_base = first_num;
+				first_l->sequence_size = 1;
+			}
+			second_l->buf[0] = 0;
+			return 0;
+		}
+		if (first_eow <= first_o || second_eow <= second_o) return 0;
+		if (first_eow-first_o != second_eow-second_o
+		    || strncmp(&first_l->buf[first_o], &second_l->buf[second_o], first_eow-first_o))
+			break;
+		first_o = first_eow;
+		second_o = second_eow;
+	}
+	// found another non-matching word, cannot merge
+	return 0;
+}
+
+// merge_seq: reads <data_file> (argv[1]) line by line, folds each line into its
+// predecessor where merge_line() allows it and prints the result to stdout.
+// Returns EXIT_SUCCESS, or EXIT_FAILURE on a usage, open or internal error.
+int main(int argc, char** argv)
+{
+	struct line_buf first_line, second_line;
+	FILE* fp = 0;
+
+	if (argc < 2) {
+		fprintf(stderr,
+			"merge_seq - merge sequence (needs presorted file)\n"
+			"Usage: %s <data_file>\n", argv[0]);
+		goto xout;
+	}
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+
+	// read first line
+	first_line.buf[0] = 0;
+	first_line.left_digit_start_o = -1;
+	first_line.right_digit_start_o = -1;
+	first_line.sequence_size = 0;
+	if (!fgets(first_line.buf, sizeof(first_line.buf), fp)
+	    || !first_line.buf[0]) goto out;
+
+	while (1) {
+		// read second line
+		second_line.buf[0] = 0;
+		second_line.left_digit_start_o = -1;
+		second_line.right_digit_start_o = -1;
+		second_line.sequence_size = 0;
+		if (!fgets(second_line.buf, sizeof(second_line.buf), fp))
+			break;
+		// can the two be merged? (a merge empties second_line.buf)
+		if (merge_line(&first_line, &second_line)) goto xout;
+		if (second_line.buf[0]) {
+			// no: print first line and move second line to first
+			if (print_line(&first_line)) goto xout;
+			first_line = second_line;
+		}
+	}
+	if (print_line(&first_line)) goto xout;
+out:
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	if (fp) fclose(fp); // fp is still 0 for usage and open errors
+	return EXIT_FAILURE;
+}
diff --git a/sort_seq.c b/sort_seq.c
new file mode 100644
index 0000000..d0c24cd
--- /dev/null
+++ b/sort_seq.c
@@ -0,0 +1,147 @@
+//
+// Author: Wolfgang Spraul
+//
+// This is free and unencumbered software released into the public domain.
+// For details see the UNLICENSE file at the root of the source tree.
+//
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define LINE_LENGTH	1024	// size of one line buffer, terminating NUL included
+
+static int s_numlines;	// number of lines currently held in s_lines
+static char s_lines[1000][LINE_LENGTH];	// window of lines that is sorted at once
+
+// Returns 1 if str is exactly one of the suffixes in the table below
+// (all of which start with '_'), 0 otherwise.
+static int is_known_suffix(const char* str)
+{
+	static const char* const suffixes[] = {
+		"_S0", "_N3", "_INT0", "_INT1", "_INT2", "_INT3",
+		"_TEST", "_BRK", "_BUF", "_FOLD", "_BRAM", "_BRAM_INTER",
+		"_CLB", "_DSP", "_INT", "_MCB", "_DOWN", "_UP",
+		"_E", "_W", "_S", "_N", "_M", "_EXT", "_PINW" };
+	const char* const* entry = suffixes;
+	const char* const* stop = suffixes + sizeof(suffixes)/sizeof(suffixes[0]);
+
+	if (*str != '_') return 0; // cheap rejection before scanning the table
+	while (entry < stop)
+		if (!strcmp(*entry++, str)) return 1;
+	return 0;
+}
+
+// Copies the leading word of s (everything up to the first blank, tab,
+// newline or the terminating NUL) into buf and NUL-terminates it.
+// buf must be large enough; no bound is checked.
+static void copy_word(char* buf, const char* s)
+{
+	size_t word_len = strcspn(s, " \t\n");
+	memcpy(buf, s, word_len);
+	buf[word_len] = 0;
+}
+
+// qsort() comparator for two NUL-terminated lines: the first differing characters
+// decide, unless the difference lies inside a number - then a known suffix (or
+// nothing) after the number is compared first, then the values, then the rest.
+int sort_lines(const void* a, const void* b)
+{
+	const char* _a = a, *_b = b;
+	int i, a_i, b_i, rc;
+	long a_num, b_num;
+	char a_word[1024], b_word[1024];
+
+	// search first non-matching character
+	for (i = 0; _a[i] && _a[i] == _b[i]; i++);
+
+	// if entire string matches, return 0
+	if (!_a[i] && !_b[i]) return 0;
+
+	// if neither of the non-matching characters is a digit, return
+	if ((_a[i] < '0' || _a[i] > '9')
+	    && (_b[i] < '0' || _b[i] > '9'))
+		return _a[i] - _b[i];
+
+	// go back to beginning of numeric section
+	// (a and b must be identical going backwards)
+	while (i && _a[i-1] >= '0' && _a[i-1] <= '9')
+		i--;
+
+	// go forward to first non-digit
+	for (a_i = i; _a[a_i] >= '0' && _a[a_i] <= '9'; a_i++ );
+	for (b_i = i; _b[b_i] >= '0' && _b[b_i] <= '9'; b_i++ );
+
+	// there must be at least one digit on each side
+	if (a_i <= i || b_i <= i) {
+		// Only one side has a digit here: that side sorts after the other.
+		if (_a[i] >= '0' && _a[i] <= '9'
+		    && (_b[i] < '0' || _b[i] > '9')) return 1;
+		if (_b[i] >= '0' && _b[i] <= '9'
+		    && (_a[i] < '0' || _a[i] > '9')) return -1;
+		return _a[i] - _b[i];
+	}
+
+	// for known suffixes, the suffix comes before the number
+	copy_word(a_word, &_a[a_i]);
+	copy_word(b_word, &_b[b_i]);
+	if ((!a_word[0] || is_known_suffix(a_word))
+	    && (!b_word[0] || is_known_suffix(b_word))) {
+		rc = strcmp(a_word, b_word);
+		if (rc) return rc;
+	}
+
+	a_num = strtol(&_a[i], 0 /* endptr */, 10);
+	b_num = strtol(&_b[i], 0 /* endptr */, 10);
+	if (a_num != b_num)
+		return a_num < b_num ? -1 : 1; // no subtraction: it could overflow
+	return strcmp(&_a[a_i], &_b[b_i]);
+}
+
+// sort_seq: prints <data_file> (argv[1]) ordered by sort_lines(), using a window
+// of at most 1000 lines: after each sort the first 800 lines are printed and the
+// rest is kept for the next round. Returns EXIT_SUCCESS, else EXIT_FAILURE.
+int main(int argc, char** argv)
+{
+	FILE* fp = 0;
+	int i;
+
+	if (argc < 2) {
+		fprintf(stderr,
+			"sort_seq - sort by sequence\n"
+			"Usage: %s <data_file>\n", argv[0]);
+		goto xout;
+	}
+	fp = fopen(argv[1], "r");
+	if (!fp) {
+		fprintf(stderr, "Error opening %s.\n", argv[1]);
+		goto xout;
+	}
+	s_numlines = 0;
+	// read 200 lines to beginning of buffer
+	while (s_numlines < 200
+	       && fgets(s_lines[s_numlines], sizeof(s_lines[0]), fp))
+		s_numlines++;
+	while (1) {
+		// read another 800 lines
+		while (s_numlines < 1000
+		       && fgets(s_lines[s_numlines], sizeof(s_lines[0]), fp))
+			s_numlines++;
+		if (!s_numlines) break;
+		// sort 1000 lines
+		qsort(s_lines, s_numlines, sizeof(s_lines[0]), sort_lines);
+		// print first 800 lines; fputs() keeps a '%' in the data from being
+		// interpreted as a printf conversion
+		for (i = 0; i < 800 && i < s_numlines; i++)
+			fputs(s_lines[i], stdout);
+		// move up last 200 lines to beginning of buffer
+		if (s_numlines > i)
+			memmove(s_lines[0], s_lines[i],
+				(s_numlines-i)*sizeof(s_lines[0]));
+		s_numlines -= i; // i lines were printed; 0 are left if all were printed
+	}
+	fclose(fp);
+	return EXIT_SUCCESS;
+xout:
+	return EXIT_FAILURE;
+}