ethminer/libserpent/tokenize.cpp

#include <stdio.h>
#include <iostream>
#include <vector>
#include <map>
#include "util.h"

// These appear as independent tokens even if inside a stream of symbols
const std::string atoms[] = { "#", "//", "(", ")", "[", "]", "{", "}" };
const int numAtoms = 8;

// Is the char alphanumeric, a space, a bracket, a quote, a symbol?
int chartype(char c) {
    if (c >= '0' && c <= '9') return ALPHANUM;
    else if (c >= 'a' && c <= 'z') return ALPHANUM;
    else if (c >= 'A' && c <= 'Z') return ALPHANUM;
	else if (std::string("~_$").find(c) != std::string::npos) return ALPHANUM;
    else if (c == '\t' || c == ' ' || c == '\n') return SPACE;
	else if (std::string("()[]{}").find(c) != std::string::npos) return BRACK;
    else if (c == '"') return DQUOTE;
    else if (c == '\'') return SQUOTE;
    else return SYMB;
}

// "y = f(45,124)/3" -> [ "y", "f", "(", "45", ",", "124", ")", "/", "3"]
std::vector<Node> tokenize(std::string inp, Metadata metadata, bool lispMode) {
    int curtype = SPACE;
	unsigned pos = 0;
    int lastNewline = 0;
    metadata.ch = 0;
    std::string cur;
    std::vector<Node> out;

    inp += " ";
    while (pos < inp.length()) {
        int headtype = chartype(inp[pos]);
        if (lispMode) {
            if (inp[pos] == '\'') headtype = ALPHANUM;
        }
        // Are we inside a quote?
        if (curtype == SQUOTE || curtype == DQUOTE) {
            // Close quote
            if (headtype == curtype) {
                cur += inp[pos];
                out.push_back(token(cur, metadata));
                cur = "";
                metadata.ch = pos - lastNewline;
                curtype = SPACE;
                pos += 1;
            }
            // eg. \xc3
            else if (inp.length() >= pos + 4 && inp.substr(pos, 2) == "\\x") {
                cur += (std::string("0123456789abcdef").find(inp[pos+2]) * 16
                        + std::string("0123456789abcdef").find(inp[pos+3]));
                pos += 4;
            }
            // Newline
            else if (inp.substr(pos, 2) == "\\n") {
                cur += '\n';
                pos += 2;
            }
            // Backslash escape
            else if (inp.length() >= pos + 2 && inp[pos] == '\\') {
                cur += inp[pos + 1];
                pos += 2;
            }
            // Normal character
            else {
                cur += inp[pos];
                pos += 1;
            }
        }
        else {
            // Handle atoms ( '//', '#',  brackets )
            for (int i = 0; i < numAtoms; i++) {
                int split = cur.length() - atoms[i].length();
                if (split >= 0 && cur.substr(split) == atoms[i]) {
                    if (split > 0) {
                        out.push_back(token(cur.substr(0, split), metadata));
                    }
                    metadata.ch += split;
                    out.push_back(token(cur.substr(split), metadata));
                    metadata.ch = pos - lastNewline;
                    cur = "";
                    curtype = SPACE;
                }
            }
            // Special case the minus sign
            if (cur.length() > 1 && (cur.substr(cur.length() - 1) == "-"
                                  || cur.substr(cur.length() - 1) == "!")) {
                out.push_back(token(cur.substr(0, cur.length() - 1), metadata));
                out.push_back(token(cur.substr(cur.length() - 1), metadata));
                cur = "";
            }
            // Boundary between different char types
            if (headtype != curtype) {
                if (curtype != SPACE && cur != "") {
                    out.push_back(token(cur, metadata));
                }
                metadata.ch = pos - lastNewline;
                cur = "";
            }
            cur += inp[pos];
            curtype = headtype;
            pos += 1;
        }
        if (inp[pos] == '\n') {
            lastNewline = pos;
            metadata.ch = 0;
            metadata.ln += 1;
        }
    }
    return out;
}
Project refactor. Introduce the Serpent library. 11 years ago			`#include <stdio.h>`
			`#include <iostream>`
			`#include <vector>`
			`#include <map>`
			`#include "util.h"`

			`// These appear as independent tokens even if inside a stream of symbols`
			`const std::string atoms[] = { "#", "//", "(", ")", "[", "]", "{", "}" };`
			`const int numAtoms = 8;`

			`// Is the char alphanumeric, a space, a bracket, a quote, a symbol?`
			`int chartype(char c) {`
			`if (c >= '0' && c <= '9') return ALPHANUM;`
			`else if (c >= 'a' && c <= 'z') return ALPHANUM;`
			`else if (c >= 'A' && c <= 'Z') return ALPHANUM;`
Fix for State and Transaction. Serpent update. 10 years ago			`else if (std::string("~_$").find(c) != std::string::npos) return ALPHANUM;`
Project refactor. Introduce the Serpent library. 11 years ago			`else if (c == '\t' \|\| c == ' ' \|\| c == '\n') return SPACE;`
			`else if (std::string("()[]{}").find(c) != std::string::npos) return BRACK;`
			`else if (c == '"') return DQUOTE;`
			`else if (c == '\'') return SQUOTE;`
			`else return SYMB;`
			`}`

			`// "y = f(45,124)/3" -> [ "y", "f", "(", "45", ",", "124", ")", "/", "3"]`
Sync script for V's serpent impl. Synced with V's serpent. 11 years ago			`std::vector<Node> tokenize(std::string inp, Metadata metadata, bool lispMode) {`
Project refactor. Introduce the Serpent library. 11 years ago			`int curtype = SPACE;`
			`unsigned pos = 0;`
			`int lastNewline = 0;`
			`metadata.ch = 0;`
			`std::string cur;`
			`std::vector<Node> out;`

			`inp += " ";`
			`while (pos < inp.length()) {`
			`int headtype = chartype(inp[pos]);`
Sync script for V's serpent impl. Synced with V's serpent. 11 years ago			`if (lispMode) {`
			`if (inp[pos] == '\'') headtype = ALPHANUM;`
			`}`
Project refactor. Introduce the Serpent library. 11 years ago			`// Are we inside a quote?`
			`if (curtype == SQUOTE \|\| curtype == DQUOTE) {`
			`// Close quote`
			`if (headtype == curtype) {`
			`cur += inp[pos];`
			`out.push_back(token(cur, metadata));`
			`cur = "";`
			`metadata.ch = pos - lastNewline;`
			`curtype = SPACE;`
			`pos += 1;`
			`}`
			`// eg. \xc3`
			`else if (inp.length() >= pos + 4 && inp.substr(pos, 2) == "\\x") {`
			`cur += (std::string("0123456789abcdef").find(inp[pos+2]) * 16`
			`+ std::string("0123456789abcdef").find(inp[pos+3]));`
			`pos += 4;`
			`}`
			`// Newline`
			`else if (inp.substr(pos, 2) == "\\n") {`
			`cur += '\n';`
			`pos += 2;`
			`}`
			`// Backslash escape`
			`else if (inp.length() >= pos + 2 && inp[pos] == '\\') {`
			`cur += inp[pos + 1];`
			`pos += 2;`
			`}`
			`// Normal character`
			`else {`
			`cur += inp[pos];`
			`pos += 1;`
			`}`
			`}`
			`else {`
			`// Handle atoms ( '//', '#', brackets )`
			`for (int i = 0; i < numAtoms; i++) {`
			`int split = cur.length() - atoms[i].length();`
			`if (split >= 0 && cur.substr(split) == atoms[i]) {`
			`if (split > 0) {`
			`out.push_back(token(cur.substr(0, split), metadata));`
			`}`
			`metadata.ch += split;`
			`out.push_back(token(cur.substr(split), metadata));`
			`metadata.ch = pos - lastNewline;`
			`cur = "";`
			`curtype = SPACE;`
			`}`
			`}`
			`// Special case the minus sign`
Reorganisation. 10 years ago			`if (cur.length() > 1 && (cur.substr(cur.length() - 1) == "-"`
			`\|\| cur.substr(cur.length() - 1) == "!")) {`
Project refactor. Introduce the Serpent library. 11 years ago			`out.push_back(token(cur.substr(0, cur.length() - 1), metadata));`
Reorganisation. 10 years ago			`out.push_back(token(cur.substr(cur.length() - 1), metadata));`
Project refactor. Introduce the Serpent library. 11 years ago			`cur = "";`
			`}`
			`// Boundary between different char types`
			`if (headtype != curtype) {`
			`if (curtype != SPACE && cur != "") {`
			`out.push_back(token(cur, metadata));`
			`}`
			`metadata.ch = pos - lastNewline;`
			`cur = "";`
			`}`
			`cur += inp[pos];`
			`curtype = headtype;`
			`pos += 1;`
			`}`
			`if (inp[pos] == '\n') {`
			`lastNewline = pos;`
			`metadata.ch = 0;`
			`metadata.ln += 1;`
			`}`
			`}`
			`return out;`
			`}`