ITPub博客

首页 > 数据库 > PostgreSQL > PostgreSQL 源码解读(170)- 查询#90(PG中的词法定义:scan.l)#3

PostgreSQL 源码解读(170)- 查询#90(PG中的词法定义:scan.l)#3

原创 PostgreSQL 作者:husthxd 时间:2019-04-17 17:36:19 0 删除 编辑

输入一条SQL语句,PostgreSQL如何解析输入的SQL,识别SQL类型以及基表/字段等信息?接下来的几节将逐一进行解析.
本节介绍PostgreSQL的词法定义文件(Flex输入文件),即源码中的src/backend/parser/scan.l.
如前所述,Flex输入文件由四部分组成:


%{
Declarations
%}
Definitions
%%
Rules
%%
User subroutines

本节介绍第三部分Rules.

一、Rules

在Flex的模式文件中,%%和%%之间的内容被称为规则(rules)。每条规则由匹配模式(pattern)和动作(action)组成:模式在前,用正则表达式表示;动作在后,即C代码,用花括号括起来时可以跨越多行(下面PG的规则大多如此)。每当一个模式被匹配到时,后面的C代码将被执行。
Flex会将规则翻译成名为yylex的函数,该函数扫描输入文件(默认标准输入),当扫描到一个完整的、最长的、可以和某条规则的正则表达式所匹配的输入时,函数会执行此规则后面的C代码。如果代码中没有return语句,则执行完毕后,yylex会继续运行,开始下一轮的扫描和匹配。注意:当有多条规则的模式被匹配到时, yylex会优先选择匹配长度最长的那条规则,如果有匹配长度相等的规则,则选择排在最前面的那条规则。

PG中的规则定义如下:


%%
{whitespace}    {
                    //--------- Whitespace
                    // Matched text is discarded: no action is taken and no token is returned.
                    /* ignore */
                }
{xcstart}        {
                    //--------- C-style comment
                    /* Set location in case of syntax error in comment */
                    // Record the position so an error inside the comment can be located.
                    SET_YYLLOC();
                    // Reset the nesting depth for this new outermost comment.
                    yyextra->xcdepth = 0;
                    // Enter the xc start condition.
                    BEGIN(xc);
                    /* Put back any characters past slash-star; see above */
                    // Return everything after the opener to the input for rescanning.
                    // Note: "/*" is 2 characters, so yyless(2) keeps exactly those two.
                    yyless(2);
                }
<xc>{xcstart}    {
                    // A nested comment opener inside a comment: nesting depth + 1.
                    (yyextra->xcdepth)++;
                    /* Put back any characters past slash-star; see above */
                    // Likewise, keep only the two-character opener.
                    yyless(2);
                }
<xc>{xcstop}    {
                    // Comment terminator: at depth <= 0 return to the INITIAL
                    // state, otherwise leave one nesting level (depth - 1).
                    if (yyextra->xcdepth <= 0)
                        BEGIN(INITIAL);
                    else
                        (yyextra->xcdepth)--;
                }
<xc>{xcinside}    {
                    // Text inside the comment: discarded.
                    /* ignore */
                }
<xc>{op_chars}    {
                    // Operator characters inside the comment: discarded.
                    /* ignore */
                }
<xc>\*+            {
                    // One or more asterisks inside the comment: discarded.
                    /* ignore */
                }
<xc><<EOF>>        { yyerror("unterminated /* comment"); }// end of input inside a comment is an error
{xbstart}        {
                    /* Binary bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "b" on the string
                     * to mark it for the input routine as a binary string.
                     */
                    //--------- Binary bit-string literal
                    // Record the position, enter the xb start condition, start a
                    // new literal and seed it with the marker character 'b'.
                    SET_YYLLOC();
                    BEGIN(xb);
                    startlit();
                    addlitchar('b', yyscanner);
                }
<xb>{quotestop}    |
<xb>{quotefail} {
                    // End of the bit string (both patterns share this action):
                    // yyless(1) keeps only the first matched character and puts
                    // the rest back, then the collected literal is returned as BCONST.
                    yyless(1);
                    BEGIN(INITIAL);
                    yylval->str = litbufdup(yyscanner);
                    return BCONST;
                }
<xh>{xhinside}    |
<xb>{xbinside}    {
                    // Body text of a hex (xh) or bit (xb) string: pass the
                    // matched text to addlit() for accumulation in the literal.
                    addlit(yytext, yyleng, yyscanner);
                }
<xh>{quotecontinue}    |
<xb>{quotecontinue}    {
                    // Continuation between string segments contributes nothing.
                    /* ignore */
                }
<xb><<EOF>>        { yyerror("unterminated bit string literal"); }
{xhstart}        {
                    //------------- Hexadecimal string literal
                    /* Hexadecimal bit type.
                     * At some point we should simply pass the string
                     * forward to the parser and label it there.
                     * In the meantime, place a leading "x" on the string
                     * to mark it for the input routine as a hex string.
                     */
                    // Same sequence as the bit-string start rule, but the xh
                    // start condition is entered and the marker character is 'x'.
                    SET_YYLLOC();
                    BEGIN(xh);
                    startlit();
                    addlitchar('x', yyscanner);
                }
<xh>{quotestop}    |
<xh>{quotefail} {
                    // End of the hex string (both patterns share this action):
                    // keep only the first matched character, put the rest back,
                    // and return the collected literal as XCONST.
                    yyless(1);
                    BEGIN(INITIAL);
                    yylval->str = litbufdup(yyscanner);
                    return XCONST;
                }
<xh><<EOF>>        { yyerror("unterminated hexadecimal string literal"); }
{xnstart}        {
                    //------------- National character string prefix
                    /* National character.
                     * We will pass this along as a normal character string,
                     * but preceded with an internally-generated "NCHAR".
                     */
                    const ScanKeyword *keyword;
                    SET_YYLLOC();
                    // Keep only the first matched character; everything after it
                    // goes back to the input and is rescanned by later rules.
                    yyless(1);    /* eat only 'n' this time */
                    // Look up "nchar" in the scanner's keyword table.
                    keyword = ScanKeywordLookup("nchar",
                                                yyextra->keywords,
                                                yyextra->num_keywords);
                    if (keyword != NULL)
                    {
                        // Found: return the keyword's token value and name.
                        yylval->keyword = keyword->name;
                        return keyword->value;
                    }
                    else
                    {
                        /* If NCHAR isn't a keyword, just return "n" */
                        yylval->str = pstrdup("n");
                        return IDENT;
                    }
                }
{xqstart}        {
                    //------------- Quoted string literal
                    // The body is scanned in xq when standard_conforming_strings
                    // is set and in xe otherwise; warn_on_first_escape is enabled.
                    yyextra->warn_on_first_escape = true;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    if (yyextra->standard_conforming_strings)
                        BEGIN(xq);
                    else
                        BEGIN(xe);
                    startlit();
                }
{xestart}        {
                    // String literal that always uses the xe start condition;
                    // unlike the rule above, warn_on_first_escape is cleared.
                    yyextra->warn_on_first_escape = false;
                    yyextra->saw_non_ascii = false;
                    SET_YYLLOC();
                    BEGIN(xe);
                    startlit();
                }
{xusstart}        {
                    // String literal with Unicode escapes (see the error text
                    // below): an error is reported when
                    // standard_conforming_strings is off, otherwise the body
                    // is scanned in the xus start condition.
                    SET_YYLLOC();
                    if (!yyextra->standard_conforming_strings)
                        ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("unsafe use of string constant with Unicode escapes"),
                                 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
                                 lexer_errposition()));
                    BEGIN(xus);
                    startlit();
                }
<xq,xe>{quotestop}    |
<xq,xe>{quotefail} {
                    // End of an xq/xe string (both patterns share this action):
                    // keep only the first matched character, put the rest back,
                    // and return the collected literal as SCONST.
                    yyless(1);
                    BEGIN(INITIAL);
                    /*
                     * check that the data remains valid if it might have been
                     * made invalid by unescaping any chars.
                     */
                    if (yyextra->saw_non_ascii)
                        pg_verifymbstr(yyextra->literalbuf,
                                       yyextra->literallen,
                                       false);
                    yylval->str = litbufdup(yyscanner);
                    return SCONST;
                }
<xus>{quotestop} |
<xus>{quotefail} {
                    /* throw back all but the quote */
                    yyless(1);
                    /* xusend state looks for possible UESCAPE */
                    BEGIN(xusend);
                }
<xusend>{whitespace} {
                    /* stay in xusend state over whitespace */
                }
<xusend><<EOF>> |
<xusend>{other} |
<xusend>{xustop1} {
                    /* no UESCAPE after the quote, throw back everything */
                    // The literal is decoded with backslash as the escape character.
                    yyless(0);
                    BEGIN(INITIAL);
                    yylval->str = litbuf_udeescape('\\', yyscanner);
                    return SCONST;
                }
<xusend>{xustop2} {
                    /* found UESCAPE after the end quote */
                    // The second-to-last matched character is taken as the escape
                    // character; it is validated first and then used for decoding.
                    // NOTE(review): presumably that is the character between the
                    // quotes of the UESCAPE clause - confirm against the xustop2
                    // definition, which is not shown here.
                    BEGIN(INITIAL);
                    if (!check_uescapechar(yytext[yyleng - 2]))
                    {
                        SET_YYLLOC();
                        ADVANCE_YYLLOC(yyleng - 2);
                        yyerror("invalid Unicode escape character");
                    }
                    yylval->str = litbuf_udeescape(yytext[yyleng - 2],
                                                   yyscanner);
                    return SCONST;
                }
<xq,xe,xus>{xqdouble} {
                    // This pattern contributes one single-quote character to the literal.
                    addlitchar('\'', yyscanner);
                }
<xq,xus>{xqinside}  {
                    // Ordinary body text of an xq/xus string: accumulate as is.
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeinside}  {
                    // Ordinary body text of an xe string: accumulate as is.
                    addlit(yytext, yyleng, yyscanner);
                }
<xe>{xeunicode} {
                    // Skip the first two matched characters and parse the rest
                    // as a hexadecimal code value.
                    pg_wchar    c = strtoul(yytext + 2, NULL, 16);
                    check_escape_warning(yyscanner);
                    if (is_utf16_surrogate_first(c))
                    {
                        // First half of a surrogate pair: remember it and
                        // switch to xeu to wait for the second half.
                        yyextra->utf16_first_part = c;
                        BEGIN(xeu);
                    }
                    else if (is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");
                    else
                        addunicode(c, yyscanner);
                }
<xeu>{xeunicode} {
                    // Second escape after a saved first surrogate: it must be a
                    // second surrogate; the pair is combined into one code
                    // point, added, and scanning returns to xe.
                    pg_wchar    c = strtoul(yytext + 2, NULL, 16);
                    if (!is_utf16_surrogate_second(c))
                        yyerror("invalid Unicode surrogate pair");
                    c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
                    addunicode(c, yyscanner);
                    BEGIN(xe);
                }
<xeu>.            { yyerror("invalid Unicode surrogate pair"); }
<xeu>\n            { yyerror("invalid Unicode surrogate pair"); }
<xeu><<EOF>>    { yyerror("invalid Unicode surrogate pair"); }
<xe,xeu>{xeunicodefail}    {
                    // Malformed Unicode escape: report an error with a hint
                    // describing the two accepted forms.
                    ereport(ERROR,
                            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                             errmsg("invalid Unicode escape"),
                             errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
                             lexer_errposition()));
                }
<xe>{xeescape}  {
                    // Single-character escape; yytext[1] is the escaped character.
                    if (yytext[1] == '\'')
                    {
                        // An escaped single quote is rejected when
                        // backslash_quote is OFF, or when it is SAFE_ENCODING
                        // and the client encoding is client-only.
                        if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
                            (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
                             PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
                            ereport(ERROR,
                                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                                     errmsg("unsafe use of \\' in a string literal"),
                                     errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
                                     lexer_errposition()));
                    }
                    check_string_escape_warning(yytext[1], yyscanner);
                    addlitchar(unescape_single_char(yytext[1], yyscanner),
                               yyscanner);
                }
<xe>{xeoctesc}  {
                    // Octal escape: skip the first matched character and parse
                    // the rest in base 8. A zero byte or a byte with the high
                    // bit set marks the literal for later validation.
                    unsigned char c = strtoul(yytext + 1, NULL, 8);
                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xe>{xehexesc}  {
                    // Hex escape: skip the first two matched characters and
                    // parse the rest in base 16; same marking as the octal case.
                    unsigned char c = strtoul(yytext + 2, NULL, 16);
                    check_escape_warning(yyscanner);
                    addlitchar(c, yyscanner);
                    if (c == '\0' || IS_HIGHBIT_SET(c))
                        yyextra->saw_non_ascii = true;
                }
<xq,xe,xus>{quotecontinue} {
                    // Continuation between string segments contributes nothing.
                    /* ignore */
                }
<xe>.            {
                    /* This is only needed for \ just before EOF */
                    addlitchar(yytext[0], yyscanner);
                }
<xq,xe,xus><<EOF>>        { yyerror("unterminated quoted string"); }
{dolqdelim}        {
                    //------------- Dollar-quoted string
                    // Save a copy of the opening delimiter so that the closing
                    // delimiter can be compared against it, then scan in xdolq.
                    SET_YYLLOC();
                    yyextra->dolqstart = pstrdup(yytext);
                    BEGIN(xdolq);
                    startlit();
                }
{dolqfailed}    {
                    SET_YYLLOC();
                    /* throw back all but the initial "$" */
                    yyless(1);
                    /* and treat it as {other} */
                    return yytext[0];
                }
<xdolq>{dolqdelim} {
                    // A delimiter-shaped token inside the body: it ends the
                    // string only if it equals the saved opening delimiter.
                    if (strcmp(yytext, yyextra->dolqstart) == 0)
                    {
                        // Matching delimiter: free the saved copy and return
                        // the collected literal as SCONST.
                        pfree(yyextra->dolqstart);
                        yyextra->dolqstart = NULL;
                        BEGIN(INITIAL);
                        yylval->str = litbufdup(yyscanner);
                        return SCONST;
                    }
                    else
                    {
                        /*
                         * When we fail to match $...$ to dolqstart, transfer
                         * the $... part to the output, but put back the final
                         * $ for rescanning.  Consider $delim$...$junk$delim$
                         */
                        addlit(yytext, yyleng - 1, yyscanner);
                        yyless(yyleng - 1);
                    }
                }
<xdolq>{dolqinside} {
                    // Ordinary body text: accumulate as is.
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>{dolqfailed} {
                    // An incomplete delimiter inside the body is plain text.
                    addlit(yytext, yyleng, yyscanner);
                }
<xdolq>.        {
                    /* This is only needed for $ inside the quoted text */
                    addlitchar(yytext[0], yyscanner);
                }
<xdolq><<EOF>>    { yyerror("unterminated dollar-quoted string"); }
{xdstart}        {
                    //------------- Delimited (quoted) identifier
                    // Record the position, enter xd and start a new literal.
                    SET_YYLLOC();
                    BEGIN(xd);
                    startlit();
                }
{xuistart}        {
                    // Delimited identifier scanned in the xui start condition;
                    // the xuiend rules below decode it with litbuf_udeescape().
                    SET_YYLLOC();
                    BEGIN(xui);
                    startlit();
                }
<xd>{xdstop}    {
                    // End of a delimited identifier: an empty one is an error;
                    // one whose collected length is >= NAMEDATALEN is passed
                    // to truncate_identifier() before being returned as IDENT.
                    char       *ident;
                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbufdup(yyscanner);
                    if (yyextra->literallen >= NAMEDATALEN)
                        truncate_identifier(ident, yyextra->literallen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xui>{dquote} {
                    // Keep only the first matched character and put the rest back.
                    yyless(1);
                    /* xuiend state looks for possible UESCAPE */
                    BEGIN(xuiend);
                }
<xuiend>{whitespace} {
                    /* stay in xuiend state over whitespace */
                }
<xuiend><<EOF>> |
<xuiend>{other} |
<xuiend>{xustop1} {
                    /* no UESCAPE after the quote, throw back everything */
                    // The identifier is decoded with backslash as the escape
                    // character; its length is measured after decoding.
                    char       *ident;
                    int            identlen;
                    yyless(0);
                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    ident = litbuf_udeescape('\\', yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xuiend>{xustop2}    {
                    /* found UESCAPE after the end quote */
                    // The second-to-last matched character is taken as the escape
                    // character; it is validated first and then used for decoding.
                    char       *ident;
                    int            identlen;
                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
                    if (!check_uescapechar(yytext[yyleng - 2]))
                    {
                        SET_YYLLOC();
                        ADVANCE_YYLLOC(yyleng - 2);
                        yyerror("invalid Unicode escape character");
                    }
                    ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
                    identlen = strlen(ident);
                    if (identlen >= NAMEDATALEN)
                        truncate_identifier(ident, identlen, true);
                    yylval->str = ident;
                    return IDENT;
                }
<xd,xui>{xddouble}    {
                    // This pattern contributes one double-quote character to the identifier.
                    addlitchar('"', yyscanner);
                }
<xd,xui>{xdinside}    {
                    // Ordinary identifier text: accumulate as is.
                    addlit(yytext, yyleng, yyscanner);
                }
<xd,xui><<EOF>>        { yyerror("unterminated quoted identifier"); }
{xufailed}    {
                    // Fallback rule: only the first matched character is kept
                    // and returned as an ordinary identifier; the rest of the
                    // match is put back for rescanning.
                    char       *ident;
                    SET_YYLLOC();
                    /* throw back all but the initial u/U */
                    yyless(1);
                    /* and treat it as {identifier} */
                    ident = downcase_truncate_identifier(yytext, yyleng, true);
                    yylval->str = ident;
                    return IDENT;
                }
{typecast}        {
                    //------------- Fixed multi-character operators
                    // Each rule below records the token position and returns
                    // its own dedicated token code.
                    SET_YYLLOC();
                    return TYPECAST;
                }
{dot_dot}        {
                    SET_YYLLOC();
                    return DOT_DOT;
                }
{colon_equals}    {
                    SET_YYLLOC();
                    return COLON_EQUALS;
                }
{equals_greater} {
                    SET_YYLLOC();
                    return EQUALS_GREATER;
                }
{less_equals}    {
                    SET_YYLLOC();
                    return LESS_EQUALS;
                }
{greater_equals} {
                    SET_YYLLOC();
                    return GREATER_EQUALS;
                }
{less_greater}    {
                    /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
                    SET_YYLLOC();
                    return NOT_EQUALS;
                }
{not_equals}    {
                    /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
                    SET_YYLLOC();
                    return NOT_EQUALS;
                }
{self}            {
                    // Single-character token: the character itself is the token code.
                    SET_YYLLOC();
                    return yytext[0];
                }
{operator}        {
                    //------------- Generic operator
                    // nchars is the number of leading characters that will be
                    // kept as the operator; it starts at the full match length
                    // and is reduced by the two checks below.
                    /*
                     * Check for embedded slash-star or dash-dash; those
                     * are comment starts, so operator must stop there.
                     * Note that slash-star or dash-dash at the first
                     * character will match a prior rule, not this one.
                     */
                    int            nchars = yyleng;
                    char       *slashstar = strstr(yytext, "/*");
                    char       *dashdash = strstr(yytext, "--");
                    if (slashstar && dashdash)
                    {
                        /* if both appear, take the first one */
                        if (slashstar > dashdash)
                            slashstar = dashdash;
                    }
                    else if (!slashstar)
                        slashstar = dashdash;
                    // slashstar now points at the earliest comment start, if any.
                    if (slashstar)
                        nchars = slashstar - yytext;
                    /*
                     * For SQL compatibility, '+' and '-' cannot be the
                     * last char of a multi-char operator unless the operator
                     * contains chars that are not in SQL operators.
                     * The idea is to lex '=-' as two operators, but not
                     * to forbid operator names like '?-' that could not be
                     * sequences of SQL operators.
                     */
                    if (nchars > 1 &&
                        (yytext[nchars - 1] == '+' ||
                         yytext[nchars - 1] == '-'))
                    {
                        int            ic;
                        // Scan backwards from the character before the trailing
                        // sign, stopping at the first qualifying character;
                        // ic ends below zero when none is found.
                        for (ic = nchars - 2; ic >= 0; ic--)
                        {
                            char c = yytext[ic];
                            if (c == '~' || c == '!' || c == '@' ||
                                c == '#' || c == '^' || c == '&' ||
                                c == '|' || c == '`' || c == '?' ||
                                c == '%')
                                break;
                        }
                        if (ic < 0)
                        {
                            /*
                             * didn't find a qualifying character, so remove
                             * all trailing [+-]
                             */
                            do {
                                nchars--;
                            } while (nchars > 1 &&
                                 (yytext[nchars - 1] == '+' ||
                                  yytext[nchars - 1] == '-'));
                        }
                    }
                    SET_YYLLOC();
                    if (nchars < yyleng)
                    {
                        /* Strip the unwanted chars from the token */
                        yyless(nchars);
                        /*
                         * If what we have left is only one char, and it's
                         * one of the characters matching "self", then
                         * return it as a character token the same way
                         * that the "self" rule would have.
                         */
                        if (nchars == 1 &&
                            strchr(",()[].;:+-*/%^<>=", yytext[0]))
                            return yytext[0];
                        /*
                         * Likewise, if what we have left is two chars, and
                         * those match the tokens ">=", "<=", "=>", "<>" or
                         * "!=", then we must return the appropriate token
                         * rather than the generic Op.
                         */
                        if (nchars == 2)
                        {
                            if (yytext[0] == '=' && yytext[1] == '>')
                                return EQUALS_GREATER;
                            if (yytext[0] == '>' && yytext[1] == '=')
                                return GREATER_EQUALS;
                            if (yytext[0] == '<' && yytext[1] == '=')
                                return LESS_EQUALS;
                            if (yytext[0] == '<' && yytext[1] == '>')
                                return NOT_EQUALS;
                            if (yytext[0] == '!' && yytext[1] == '=')
                                return NOT_EQUALS;
                        }
                    }
                    /*
                     * Complain if operator is too long.  Unlike the case
                     * for identifiers, we make this an error not a notice-
                     * and-truncate, because the odds are we are looking at
                     * a syntactic mistake anyway.
                     */
                    if (nchars >= NAMEDATALEN)
                        yyerror("operator too long");
                    // Return a copy of the (possibly shortened) operator text.
                    yylval->str = pstrdup(yytext);
                    return Op;
                }
{param}            {
                    //------------- Parameter reference
                    // Skip the first matched character and convert the rest
                    // to a number with atol().
                    // NOTE(review): atol() gives no overflow or error
                    // indication - confirm that is acceptable here.
                    SET_YYLLOC();
                    yylval->ival = atol(yytext + 1);
                    return PARAM;
                }
{integer}        {
                    //------------- Numeric literals
                    // The token code is whatever process_integer_literal() returns.
                    SET_YYLLOC();
                    return process_integer_literal(yytext, yylval);
                }
{decimal}        {
                    // Returned as FCONST carrying a copy of the matched text.
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{decimalfail}    {
                    /* throw back the .., and treat as integer */
                    yyless(yyleng - 2);
                    SET_YYLLOC();
                    return process_integer_literal(yytext, yylval);
                }
{real}            {
                    // Returned as FCONST carrying a copy of the matched text.
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{realfail1}        {
                    /*
                     * throw back the [Ee], and treat as {decimal}.  Note
                     * that it is possible the input is actually {integer},
                     * but since this case will almost certainly lead to a
                     * syntax error anyway, we don't bother to distinguish.
                     */
                    yyless(yyleng - 1);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{realfail2}        {
                    /* throw back the [Ee][+-], and proceed as above */
                    yyless(yyleng - 2);
                    SET_YYLLOC();
                    yylval->str = pstrdup(yytext);
                    return FCONST;
                }
{identifier}    {
                    //---------- Identifier or keyword
                    const ScanKeyword *keyword;
                    char       *ident;
                    SET_YYLLOC();
                    /* Is it a keyword? */
                    // Look the matched text up in the scanner's keyword table.
                    keyword = ScanKeywordLookup(yytext,
                                                yyextra->keywords,
                                                yyextra->num_keywords);
                    if (keyword != NULL)
                    {
                        // Yes: return the keyword's token value and name.
                        yylval->keyword = keyword->name;
                        return keyword->value;
                    }
                    /*
                     * No.  Convert the identifier to lower case, and truncate
                     * if necessary.
                     */
                    // Not a keyword: the processed text is returned as IDENT.
                    ident = downcase_truncate_identifier(yytext, yyleng, true);
                    yylval->str = ident;
                    return IDENT;
                }
{other}            {
                    // Catch-all rule: the matched character itself is returned
                    // as the token code.
                    SET_YYLLOC();
                    return yytext[0];
                }
<<EOF>>            {
                    // End of input outside any special start condition:
                    // record the position and terminate the scan.
                    SET_YYLLOC();
                    yyterminate();
                }
%%

二、参考资料

Flex&Bison

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/6906/viewspace-2641703/,如需转载,请注明出处,否则将追究法律责任。

请登录后发表评论 登录
全部评论
长期从事政务、金融等行业产品研发和架构设计工作,ITPUB数据库版块资深版主,对Oracle、PostgreSQL以及大数据等相关技术有深入研究。现就职于广州云图数据技术有限公司,系统架构师。

注册时间:2007-12-28

  • 博文量
    1250
  • 访问量
    3724315