1 files changed, 254 insertions, 0 deletions
diff --git a/SQLiteStudio3/coreSQLiteStudio/parser/lexer.h b/SQLiteStudio3/coreSQLiteStudio/parser/lexer.h
new file mode 100644
index 0000000..b21639e
--- /dev/null
+++ b/SQLiteStudio3/coreSQLiteStudio/parser/lexer.h
@@ -0,0 +1,254 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "token.h"
+#include "dialect.h"
+
+#include <QList>
+#include <QString>
+#include <QSet>
+
+/**
+ * @brief Lexer for SQLite grammar.
+ *
+ * Lexer (aka tokenizer) splits SQL string into tokens.
+ * Tokens can then be used for syntax analysis, or for other purposes.
+ *
+ * It is useful if you have to modify some entities in the query,
+ * such as string, or object name, but you don't want to deal with
+ * all escape characters in the name, or other special characters.
+ * Lexer packs such entities into separate tokens and gives them
+ * a type, so you know what the token represents.
+ */
+class API_EXPORT Lexer
+{
+    public:
+        /**
+         * @brief Creates lexer for given dialect.
+         * @param dialect SQLite dialect.
+         */
+        Lexer(Dialect dialect);
+
+        /**
+         * @brief Releases resources.
+         */
+        virtual ~Lexer();
+
+        /**
+         * @brief Tokenizes (splits into tokens) given SQL query.
+         * @param sql SQL query to tokenize.
+         * @return List of tokens produced from tokenizing the query.
+         */
+        TokenList tokenize(const QString& sql);
+
+        /**
+         * @brief Stores given SQL query internally for further processing by the lexer.
+         * @param sql Query to remember.
+         *
+         * This method should be followed by calls to getToken().
+         */
+        void prepare(const QString& sql);
+
+        /**
+         * @brief Gets next token from query defined with prepare().
+         * @return Token read from the query, or null token if no more tokens are available.
+         *
+         * Each call to this method generates a token for the next, not yet tokenized part of the query.
+         * The usual flow for this method looks like this:
+         * @code
+         * QString query = "...";
+         * TokenPtr token;
+         * lexer.prepare(query);
+         * while (token = lexer.getToken())
+         * {
+         *     // do stuff with the token
+         * }
+         * @endcode
+         */
+        TokenPtr getToken();
+
+        /**
+         * @brief Clears query stored with prepare().
+         */
+        void cleanUp();
+
+        /**
+         * @brief Enables or disables tolerant mode.
+         * @param enabled If true, then all multi-line and unfinished tokens (strings, comments) will be reported
+         * with invalid=true in TolerantToken, but the token itself will have the type it would have if it was finished.
+         */
+        void setTolerantMode(bool enabled);
+
+        /**
+         * @brief Provides static sample tokens of all possible types.
+         * @return All possible token types.
+         * This method uses a static set of tokens, so there's no need
+         * to delete them outside.
+         *
+         * It's used by Parser to try every token type as a possible candidate for a next valid token.
+         * You should not need to use this method.
+         */
+        QSet<TokenPtr> getEveryTokenType();
+
+        /**
+         * @brief Gets static sample tokens of given types.
+         * @param types Set of token types to get tokens for. NOTE(review): a QSet has no "last element", so the former "must end with Token::INVALID" rule looks stale - confirm.
+         *
+         * It's used by Parser to try every token type as a possible candidate for a next valid token.
+         * You should not need to use this method.
+         *
+         * @overload
+         */
+        QSet<TokenPtr> getEveryTokenType(QSet<Token::Type> types);
+
+        /**
+         * @brief Tests whether lexer finished reading all tokens from the query.
+         * @return true if there are no more tokens to be read, or false otherwise.
+         *
+         * This method simply checks whether there are any characters left in the query to be tokenized.
+         * The query is the one defined with prepare(). Query shrinks with every call to getToken()
+         * and once there are no more characters to consume by getToken(), this method will return true.
+         *
+         * If you call getToken() after isEnd() returned true, getToken() has nothing left to read (it documents returning a null token then).
+         */
+        bool isEnd() const;
+
+        /**
+         * @brief Initializes internal set of static tokens.
+         * Initializes internal set of tokens used by getEveryTokenType().
+         */
+        static void staticInit();
+
+        /**
+         * @brief Restores string from token list.
+         * @param tokens List of tokens.
+         * @return String that was represented by tokens.
+         *
+         * It simply joins values of all tokens from the list using empty string separator (that is no separator at all).
+         */
+        static QString detokenize(const TokenList& tokens);
+
+        /**
+         * @brief Tokenizes given SQL query with given dialect.
+         * @param sql SQL query to tokenize.
+         * @param dialect SQLite dialect to use when tokenizing.
+         * @return List of tokens from tokenizing.
+         *
+         * This method is a shortcut for:
+         * @code
+         * Lexer lexer(dialect);
+         * lexer.tokenize(sql);
+         * @endcode
+         */
+        static TokenList tokenize(const QString& sql, Dialect dialect);
+
+        /**
+         * @brief Translates token pointer into common token shared pointer.
+         * @param token Token pointer to translate.
+         * @return Shared pointer if found, or null pointer if not found.
+         *
+         * This method should be used against token pointers extracted from getEveryTokenType() results.
+         * When a pointer from any TokenPtr (returned from getEveryTokenType()) is extracted using
+         * QSharedPointer::data(), this method can be used to get back to the QSharedPointer.
+         *
+         * As Lexer keeps a static internal list of tokens representing token types,
+         * it can translate a token pointer into a shared pointer by comparing them.
+         *
+         * This method and getEveryTokenType() methods are used strictly by Parser and you should not
+         * need to use them.
+         */
+        static TokenPtr getEveryTokenTypePtr(Token* token);
+
+        /**
+         * @brief Provides token representing semicolon in given SQLite dialect.
+         * @param dialect Dialect to use.
+         * @return Token representing semicolon.
+         *
+         * This is used by Parser to complete the parsed query in case the input query did not end with semicolon.
+         * Given the \p dialect it provides proper token for that dialect (they are different by Lemon token ID).
+         */
+        static TokenPtr getSemicolonToken(Dialect dialect);
+
+    private:
+        /**
+         * @brief Creates a token for the internal "every token type" tables.
+         * @param dialect SQLite dialect to create token for.
+         * @param lemonType Lemon token ID for this token type.
+         * @param type SQLiteStudio token type.
+         * @param value Sample value for the token.
+         * @return Created token.
+         *
+         * The internal "every token type" tables are populated using this method.
+         *
+         * @see getEveryTokenType()
+         */
+        static TokenPtr createTokenType(Dialect dialect, int lemonType, Token::Type type, const QString& value);
+
+        /**
+         * @brief Current "tolerant mode" flag.
+         *
+         * @see setTolerantMode()
+         */
+        bool tolerant = false;
+
+        /**
+         * @brief Lexer's SQLite dialect.
+         */
+        Dialect dialect;
+
+        /**
+         * @brief SQL query to be tokenized with getToken().
+         *
+         * It's defined with prepare().
+         */
+        QString sqlToTokenize;
+
+        /**
+         * @brief Current tokenizer position in the sqlToTokenize.
+         *
+         * This position index is used to track which SQL characters should be tokenized
+         * on next call to getToken().
+         *
+         * It defaults to 0 (so it is never indeterminate before prepare()) and is reset to 0 by prepare() and cleanUp().
+         */
+        quint64 tokenPosition = 0;
+
+        /**
+         * @brief Static token representing semicolon for SQLite 2.
+         *
+         * @see semicolonTokenSqlite3
+         */
+        static TokenPtr semicolonTokenSqlite2;
+
+        /**
+         * @brief Static token representing semicolon for SQLite 3.
+         *
+         * Counterpart of semicolonTokenSqlite2 for the other dialect.
+         * See getSemicolonToken() for how the per-dialect semicolon tokens are meant to be used.
+         */
+        static TokenPtr semicolonTokenSqlite3;
+
+        /**
+         * @brief Internal table of every token type for SQLite 2.
+         *
+         * @see everyTokenType3
+         */
+        static QHash<Token::Type,QSet<TokenPtr> > everyTokenType2;
+
+        /**
+         * @brief Internal table of every token type for SQLite 3.
+         *
+         * Set of tokens representing all token types, including diversification by values for keywords and operators.
+         * It's used by the Parser to probe candidates for next valid token.
+         */
+        static QHash<Token::Type,QSet<TokenPtr> > everyTokenType3;
+
+        /**
+         * @brief Map of every token type pointer to its QSharedPointer from internal tables.
+         *
+         * This is used by getEveryTokenTypePtr().
+         */
+        static QHash<Token*,TokenPtr> everyTokenTypePtrMap;
+};
+
+#endif // LEXER_H