116 lines
3.9 KiB

#include <stdio.h>
#include <iostream>
#include <vector>
#include <map>
#include "util.h"
// These appear as independent tokens even if inside a stream of symbols
const std::string atoms[] = { "#", "//", "(", ")", "[", "]", "{", "}" };
const int numAtoms = 8;
// Is the char alphanumeric, a space, a bracket, a quote, a symbol?
int chartype(char c) {
if (c >= '0' && c <= '9') return ALPHANUM;
else if (c >= 'a' && c <= 'z') return ALPHANUM;
else if (c >= 'A' && c <= 'Z') return ALPHANUM;
else if (std::string("~._$").find(c) != std::string::npos) return ALPHANUM;
else if (c == '\t' || c == ' ' || c == '\n') return SPACE;
else if (std::string("()[]{}").find(c) != std::string::npos) return BRACK;
else if (c == '"') return DQUOTE;
else if (c == '\'') return SQUOTE;
else return SYMB;
}
// "y = f(45,124)/3" -> [ "y", "f", "(", "45", ",", "124", ")", "/", "3"]
std::vector<Node> tokenize(std::string inp, Metadata metadata, bool lispMode) {
int curtype = SPACE;
unsigned pos = 0;
int lastNewline = 0;
metadata.ch = 0;
std::string cur;
std::vector<Node> out;
inp += " ";
while (pos < inp.length()) {
int headtype = chartype(inp[pos]);
if (lispMode) {
if (inp[pos] == '\'') headtype = ALPHANUM;
}
// Are we inside a quote?
if (curtype == SQUOTE || curtype == DQUOTE) {
// Close quote
if (headtype == curtype) {
cur += inp[pos];
out.push_back(token(cur, metadata));
cur = "";
metadata.ch = pos - lastNewline;
curtype = SPACE;
pos += 1;
}
// eg. \xc3
else if (inp.length() >= pos + 4 && inp.substr(pos, 2) == "\\x") {
cur += (std::string("0123456789abcdef").find(inp[pos+2]) * 16
+ std::string("0123456789abcdef").find(inp[pos+3]));
pos += 4;
}
// Newline
else if (inp.substr(pos, 2) == "\\n") {
cur += '\n';
pos += 2;
}
// Backslash escape
else if (inp.length() >= pos + 2 && inp[pos] == '\\') {
cur += inp[pos + 1];
pos += 2;
}
// Normal character
else {
cur += inp[pos];
pos += 1;
}
}
else {
// Handle atoms ( '//', '#', brackets )
for (int i = 0; i < numAtoms; i++) {
int split = cur.length() - atoms[i].length();
if (split >= 0 && cur.substr(split) == atoms[i]) {
if (split > 0) {
out.push_back(token(cur.substr(0, split), metadata));
}
metadata.ch += split;
out.push_back(token(cur.substr(split), metadata));
metadata.ch = pos - lastNewline;
cur = "";
curtype = SPACE;
}
}
// Special case the minus sign
11 years ago
if (cur.length() > 1 && (cur.substr(cur.length() - 1) == "-"
|| cur.substr(cur.length() - 1) == "!")) {
out.push_back(token(cur.substr(0, cur.length() - 1), metadata));
11 years ago
out.push_back(token(cur.substr(cur.length() - 1), metadata));
cur = "";
}
// Boundary between different char types
if (headtype != curtype) {
if (curtype != SPACE && cur != "") {
out.push_back(token(cur, metadata));
}
metadata.ch = pos - lastNewline;
cur = "";
}
cur += inp[pos];
curtype = headtype;
pos += 1;
}
if (inp[pos] == '\n') {
lastNewline = pos;
metadata.ch = 0;
metadata.ln += 1;
}
}
return out;
}