/* Simple program to search for BOLT references in C files and make sure
 * they're accurate. */
#include <ccan/err/err.h>
#include <ccan/opt/opt.h>
#include <ccan/str/str.h>
#include <ccan/tal/grab_file/grab_file.h>
#include <ccan/tal/path/path.h>
#include <ccan/tal/str/str.h>
#include <ccan/tal/tal.h>
#include <sys/types.h>
#include <dirent.h>

/* Set by the --verbose option: print each BOLT file and each checked
 * reference as we find them. */
static bool verbose = false;

/* One BOLT markdown file loaded by get_files(). */
struct bolt_file {
	/* Leading alphanumeric part of the file name (e.g. up to the '-'). */
	const char *prefix;
	/* Entire file contents, after canonicalize(). */
	const char *contents;
};

/* Squash each run of whitespace in @str down to one ' ', and drop leading
 * and trailing whitespace altogether.  The string is rewritten in place,
 * then the tal allocation is resized to fit; the pointer as left by
 * tal_resize() is returned. */
static char *canonicalize(char *str)
{
	const char *src;
	char *dst = str;
	/* Pretend we start just after a space: leading whitespace vanishes. */
	bool prev_space = true;

	for (src = str; *src != '\0'; src++) {
		if (!cisspace(*src)) {
			*dst++ = *src;
			prev_space = false;
			continue;
		}
		/* Only the first whitespace char of a run emits anything. */
		if (!prev_space) {
			*dst++ = ' ';
			prev_space = true;
		}
	}

	/* Input ended in whitespace: step back over the ' ' we emitted. */
	if (prev_space && dst != str)
		dst--;
	*dst = '\0';

	tal_resize(&str, dst - str + 1);
	return str;
}

/* Load every BOLT markdown file in <dir>/<subdir> and append it to the tal
 * array *files, growing it as needed.
 *
 * A directory entry is accepted if its name ends in ".md" and begins with
 * a non-empty run of alphanumerics which is followed either by '-' or
 * directly by ".md"; that run becomes the entry's prefix.  The contents
 * are stored after canonicalize().
 *
 * Exits via err() if the directory cannot be opened or a file cannot be
 * loaded. */
static void get_files(const char *dir, const char *subdir,
		      struct bolt_file **files)
{
	char *path = path_join(NULL, dir, subdir);
	DIR *d = opendir(path);
	size_t n = tal_count(*files);
	struct dirent *e;

	if (!d)
		err(1, "Opening BOLT dir %s", path);

	while ((e = readdir(d)) != NULL) {
		int preflen;
		char *filename, *contents;

		/* Must end in .md */
		if (!strends(e->d_name, ".md"))
			continue;

		/* Prefix is anything up to - */
		preflen = strspn(e->d_name,
				 "0123456789"
				 "abcdefghijklmnopqrstuvwxyz"
				 "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
		if (!preflen)
			continue;
		if (preflen + strlen(".md") != strlen(e->d_name)
		    && e->d_name[preflen] != '-')
			continue;

		if (verbose)
			printf("Found bolt %.*s\n", preflen, e->d_name);

		tal_resize(files, n+1);
		(*files)[n].prefix = tal_strndup(*files,
						 e->d_name, preflen);

		/* Temporary name is a child of path: freed with it below. */
		filename = path_join(path, path, e->d_name);
		contents = grab_file(*files, filename);
		/* Don't hand NULL to canonicalize(): it would dereference it. */
		if (!contents)
			err(1, "Loading %s", filename);
		(*files)[n].contents = canonicalize(contents);
		n++;
	}

	closedir(d);
	/* Also releases every joined filename allocated off path. */
	tal_free(path);
}

/* Collect the BOLT files beneath @dir: the "bolts" subdirectory first,
 * then "early-drafts".  Returns them as a tal array with no parent. */
static struct bolt_file *get_bolt_files(const char *dir)
{
	static const char *const subdirs[] = { "bolts", "early-drafts" };
	struct bolt_file *found = tal_arr(NULL, struct bolt_file, 0);
	size_t i;

	for (i = 0; i < sizeof(subdirs) / sizeof(subdirs[0]); i++)
		get_files(dir, subdirs[i], &found);

	return found;
}

/* Find the next "BOLT #<prefix>:" reference at or after *p.
 *
 * Whitespace is allowed between "BOLT" and '#', and between the prefix and
 * the ':'.  The prefix is everything after '#' up to the first ' ' or ':'.
 *
 * On success, returns the prefix as a tal-allocated string (no parent;
 * caller frees), leaves *p just past the ':' and sets *len to the number
 * of bytes from there to the next comment terminator (or to the end of
 * the string if there is none).
 *
 * Returns NULL, with *p set to NULL, when no further reference exists. */
static char *find_bolt_ref(char **p, size_t *len)
{
	for (;;) {
		const char *prefix;
		char *end;
		size_t preflen;

		/* BOLT #X: */
		*p = strstr(*p, "BOLT");
		if (!*p)
			return NULL;
		*p += 4;
		while (cisspace(**p))
			(*p)++;
		if (**p != '#')
			continue;
		(*p)++;

		/* Just remember where the prefix is: we only allocate the
		 * copy once we know this really is a reference, so a false
		 * start does not leak. */
		prefix = *p;
		preflen = strcspn(*p, " :");

		(*p) += preflen;
		while (cisspace(**p))
			(*p)++;
		if (**p != ':')
			continue;
		(*p)++;

		end = strstr(*p, "*/");
		if (!end)
			*len = strlen(*p);
		else
			*len = end - *p;
		return tal_strndup(NULL, prefix, preflen);
	}
}

/* Convert up to @len bytes of quoted comment text at @code into a pattern
 * to look for in a canonicalized BOLT file.
 *
 *  - "..." becomes ".*" (matches anything).
 *  - The first '*' on a line (only whitespace before it) is dropped: it
 *    is block-comment decoration, not quoted text.
 *  - If @escape is true, regex metacharacters are backslash-escaped so
 *    they match literally.
 *
 * The result is run through canonicalize() and returned as a tal string
 * with no parent. */
static char *code_to_regex(const char *code, size_t len, bool escape)
{
	/* Every input byte yields at most two output bytes. */
	char *pattern = tal_arr(NULL, char, len*2 + 1), *p;
	size_t i;
	bool after_nl = true;

	/* We swallow '*' if first in line: block comments */
	p = pattern;
	/* Stop at NUL too, so an over-long len cannot run off the string. */
	for (i = 0; i < len && code[i] != '\0'; i++) {
		/* ... matches anything. */
		if (strstarts(code + i, "...")) {
			*(p++) = '.';
			*(p++) = '*';
			i += 2;
			after_nl = false;
			continue;
		}

		switch (code[i]) {
		case '\n':
			after_nl = true;
			*(p++) = code[i];
			break;

		case '*':
			if (after_nl) {
				after_nl = false;
				continue;
			}
			/* Fall thru. */
		case '.':
		case '$':
		case '^':
		case '[':
		case ']':
		case '(':
		case ')':
		case '+':
		case '|':
		/* Also special in extended regexes.  ('}' is only special
		 * after an unescaped '{', which we never emit.) */
		case '?':
		case '{':
		case '\\':
			if (escape)
				*(p++) = '\\';
			/* Fall thru */
		default:
			/* Any real text means a later '*' on this line is
			 * not the leading comment decoration. */
			if (!cisspace(code[i]))
				after_nl = false;
			*(p++) = code[i];
		}
	}
	*p = '\0';
	return canonicalize(pattern);
}

/* Returns the 1-based number of the line within @raw that contains *pos,
 * and moves *pos back to the first character of that line.
 *
 * @raw must be NUL-terminated and *pos must point into it. */
static unsigned linenum(const char *raw, const char **pos)
{
	unsigned line = 0;
	const char *l = raw, *point = *pos;

	/* l visits the start of each line in turn.  Use <= so that a point
	 * sitting exactly on a line start is counted as being on that line
	 * (and so that point == raw gives line 1, not 0). */
	while (l <= point) {
		*pos = l;
		line++;
		l = strchr(l, '\n');
		if (!l)
			break;
		l++;
	}
	return line;
}

/* Report that the quote of @len bytes at @pos (inside @raw, the contents
 * of @filename) does not appear in @bolt, then exit(1).
 *
 * Prints the source line holding the reference, and, as a hint, the
 * longest leading part of the quote that does occur literally in the BOLT
 * together with the 20 characters which follow it there. */
static void fail_mismatch(const char *filename,
			  const char *raw, const char *pos,
			  size_t len, struct bolt_file *bolt)
{
	/* linenum() rewinds pos to the line start; len is only valid
	 * relative to where the quote really begins, so remember that. */
	const char *quote = pos;
	unsigned line = linenum(raw, &pos);
	char *try;
	size_t trylen;

	fprintf(stderr, "%s:%u:mismatch:%.*s\n",
		filename, line, (int)strcspn(pos, "\n"), pos);
	/* Try to find longest match, as a hint. */
	try = code_to_regex(quote, len, false);
	trylen = strlen(try);
	while (trylen) {
		const char *p = strstr(bolt->contents, try);
		if (p) {
			fprintf(stderr, "Closest match: %s...[%.20s]\n",
				try, p + trylen);
			break;
		}
		/* No luck: drop the last character and retry. */
		try[--trylen] = '\0';
	}
	exit(1);
}

/* Complain that @bolt_prefix, referenced at @pos inside @raw (the contents
 * of @filename), names no BOLT file we loaded, then exit(1). */
static void fail_nobolt(const char *filename,
			const char *raw, const char *pos,
			const char *bolt_prefix)
{
	const char *where = pos;
	unsigned lineno;

	/* Only the line number is needed; where gets rewound as a side effect. */
	lineno = linenum(raw, &where);
	fprintf(stderr, "%s:%u:unknown bolt %s\n",
		filename, lineno, bolt_prefix);
	exit(1);
}
	
static struct bolt_file *find_bolt(const char *bolt_prefix,
				   struct bolt_file *bolts)
{
	size_t i, n = tal_count(bolts);
	int boltnum;

	for (i = 0; i < n; i++)
		if (streq(bolts[i].prefix, bolt_prefix))
			return bolts+i;

	/* Now search for numerical match. */
	boltnum = atoi(bolt_prefix);
	if (boltnum) {
		for (i = 0; i < n; i++)
			if (atoi(bolts[i].prefix) == boltnum)
				return bolts+i;
	}

	return NULL;
}

/* Usage: <bolt-dir> <srcfile>...
 *
 * Loads the BOLT files under <bolt-dir>, then for each source file checks
 * that every "BOLT #N:" quote matches the text of the BOLT it names.
 * Exits non-zero (from fail_nobolt/fail_mismatch/err) on the first
 * problem; returns 0 if everything matched. */
int main(int argc, char *argv[])
{
	struct bolt_file *bolts;
	int i;

	err_set_progname(argv[0]);

	opt_register_noarg("--help|-h", opt_usage_and_exit,
			   "<bolt-dir> <srcfile>...\n"
			   "A source checker for BOLT RFC references.",
			   "Print this message.");
	opt_register_noarg("--verbose", opt_set_bool, &verbose,
			   "Print out files as we find them");

	opt_parse(&argc, argv, opt_log_stderr_exit);
	if (argc < 2)
		opt_usage_exit_fail("Expected a bolt directory");

	bolts = get_bolt_files(argv[1]);

	for (i = 2; i < argc; i++) {
		char *f = grab_file(NULL, argv[i]), *p, *bolt;
		size_t len;
		if (!f)
			err(1, "Loading %s", argv[i]);

		if (verbose)
			printf("Checking %s...\n", argv[i]);

		p = f;
		while ((bolt = find_bolt_ref(&p, &len)) != NULL) {
			char *pattern = code_to_regex(p, len, true);
			struct bolt_file *b = find_bolt(bolt, bolts);
			if (!b)
				fail_nobolt(argv[i], f, p, bolt);
			if (!tal_strreg(f, b->contents, pattern, NULL))
				fail_mismatch(argv[i], f, p, len, b);

			if (verbose)
				printf("  Found %.10s... in %s\n",
				       p, b->prefix);
			p += len;

			/* Both are allocated afresh for every reference:
			 * release them so they don't pile up. */
			tal_free(pattern);
			tal_free(bolt);
		}
		tal_free(f);
	}
	tal_free(bolts);
	return 0;
}