Upgrade http-parser

15 years ago · 9be6c501ec
6 changed files with 242 additions and 140 deletions
--- a/deps/http_parser/Makefile
+++ b/deps/http_parser/Makefile
@ -8,10 +8,10 @@ test: test_g
 test_g: http_parser_g.o test_g.o
 	gcc $(OPT_DEBUG) http_parser_g.o test_g.o -o $@

-test_g.o: test.c Makefile
+test_g.o: test.c http_parser.h Makefile
 	gcc $(OPT_DEBUG) -c test.c -o $@

-test.o: test.c Makefile
+test.o: test.c http_parser.h Makefile
 	gcc $(OPT_FAST) -c test.c -o $@

 http_parser_g.o: http_parser.c http_parser.h Makefile
@ -23,7 +23,7 @@ test-valgrind: test_g
 http_parser.o: http_parser.c http_parser.h Makefile
 	gcc $(OPT_FAST) -c http_parser.c

-test_fast: http_parser.o test.c
+test_fast: http_parser.o test.c http_parser.h
 	gcc $(OPT_FAST) http_parser.o test.c -o $@

 test-run-timed: test_fast
--- a/deps/http_parser/README.md
+++ b/deps/http_parser/README.md
@ -1,30 +1,32 @@
 HTTP Parser
 ===========

-This is a parser for HTTP messages written in C. It parses both requests
-and responses. The parser is designed to be used in performance HTTP
-applications. It does not make any allocations, it does not buffer data, and
-it can be interrupted at anytime. Depending on your architecture, it only
-requires between 100 and 200 bytes of data per message stream (in a web
-server that is per connection).
+This is a parser for HTTP messages written in C. It parses both requests and
+responses. The parser is designed to be used in high-performance HTTP
+applications. It makes no syscalls or allocations, it does not buffer data,
+and it can be interrupted at any time. Depending on your architecture, it
+requires only between 100 and 200 bytes of data per message stream (in a web
+server, that is per connection).

 Features:

  * No dependencies
-  * Parses both requests and responses.
-  * Handles persistent streams.
+  * Handles persistent streams (keep-alive).
  * Decodes chunked encoding.
-  * Extracts the following data from a message
-    * header fields and values
-    * content-length
-    * request method
-    * response status code
-    * transfer-encoding
-    * http version
-    * request path, query string, fragment
-    * message body
-  * Defends against buffer overflow attacks.
  * Upgrade support
+  * Defends against buffer overflow attacks.
+
+The parser extracts the following information from HTTP messages:
+
+  * Header fields and values
+  * Content-Length
+  * Request method
+  * Response status code
+  * Transfer-Encoding
+  * HTTP version
+  * Request path, query string, fragment
+  * Message body
+

 Usage
 -----
@ -55,10 +57,9 @@ When data is received on the socket execute the parser and check for errors.
    }

    /* Start up / continue the parser.
-     * Note we pass the recved==0 to http_parse_requests to signal
-     * that EOF has been recieved.
+     * Note we pass recved==0 to signal that EOF has been received.
     */
-    nparsed = http_parser_execute(parser, settings, buf, recved);
+    nparsed = http_parser_execute(parser, &settings, buf, recved);

    if (parser->upgrade) {
      /* handle new protocol */
@ -83,10 +84,6 @@ The parser decodes the transfer-encoding for both requests and responses
 transparently. That is, a chunked encoding is decoded before being sent to
 the on_body callback.

-It does not decode the content-encoding (gzip). Not all HTTP applications
-need to inspect the body. Decoding gzip is non-neglagable amount of
-processing (and requires making allocations). HTTP proxies using this
-parser, for example, would not want such a feature.

 The Special Problem of Upgrade
 ------------------------------
@ -109,11 +106,11 @@ information the Web Socket protocol.)

 To support this, the parser will treat this as a normal HTTP message without a
 body. Issuing both on_headers_complete and on_message_complete callbacks. However
-http_parser_execute() may finish without parsing the entire supplied buffer.
+http_parser_execute() will stop parsing at the end of the headers and return.

-The user needs to check if parser->upgrade has been set to 1 after
-http_parser_execute() returns to determine if a premature exit was due to an
-upgrade or an error.
+The user is expected to check if `parser->upgrade` has been set to 1 after
+`http_parser_execute()` returns. Non-HTTP data begins in the supplied buffer
+at the offset given by the return value of `http_parser_execute()`.


 Callbacks
@ -166,6 +163,7 @@ and apply following logic:
    |                        |            | and append callback data to it             |
     ------------------------ ------------ --------------------------------------------

+
 See examples of reading in headers:

 * [partial example](http://gist.github.com/155877) in C
--- a/deps/http_parser/http_parser.c
+++ b/deps/http_parser/http_parser.c
@ -32,9 +32,6 @@
 #endif


-#define MAX_FIELD_SIZE (80*1024)
-
-
 #define CALLBACK2(FOR)                                               \
 do {                                                                 \
  if (settings->on_##FOR) {                                          \
@ -45,20 +42,16 @@ do {                                                                 \

 #define MARK(FOR)                                                    \
 do {                                                                 \
-  parser->FOR##_mark = p;                                            \
-  parser->FOR##_size = 0;                                            \
+  FOR##_mark = p;                                                    \
 } while (0)

-
 #define CALLBACK_NOCLEAR(FOR)                                        \
 do {                                                                 \
-  if (parser->FOR##_mark) {                                          \
-    parser->FOR##_size += p - parser->FOR##_mark;                    \
-    if (parser->FOR##_size > MAX_FIELD_SIZE) return (p - data);      \
+  if (FOR##_mark) {                                                  \
    if (settings->on_##FOR) {                                        \
      if (0 != settings->on_##FOR(parser,                            \
-                                 parser->FOR##_mark,                 \
-                                 p - parser->FOR##_mark))            \
+                                 FOR##_mark,                         \
+                                 p - FOR##_mark))                    \
      {                                                              \
        return (p - data);                                           \
      }                                                              \
@ -70,7 +63,7 @@ do {                                                                 \
 #define CALLBACK(FOR)                                                \
 do {                                                                 \
  CALLBACK_NOCLEAR(FOR);                                             \
-  parser->FOR##_mark = NULL;                                         \
+  FOR##_mark = NULL;                                                 \
 } while (0)


@ -132,6 +125,8 @@ static const uint32_t  usual[] = {
 enum state
  { s_dead = 1 /* important that this is > 0 */

+  , s_start_res_or_resp
+  , s_res_or_resp_H
  , s_start_res
  , s_res_H
  , s_res_HT
@ -303,12 +298,31 @@ size_t http_parser_execute (http_parser *parser,
    return 0;
  }

-  if (parser->header_field_mark)   parser->header_field_mark   = data;
-  if (parser->header_value_mark)   parser->header_value_mark   = data;
-  if (parser->fragment_mark)       parser->fragment_mark       = data;
-  if (parser->query_string_mark)   parser->query_string_mark   = data;
-  if (parser->path_mark)           parser->path_mark           = data;
-  if (parser->url_mark)            parser->url_mark            = data;
+  /* technically we could combine all of these (except for url_mark) into one
+     variable, saving stack space, but it seems more clear to have them
+     separated. */
+  const char *header_field_mark = 0;
+  const char *header_value_mark = 0;
+  const char *fragment_mark = 0;
+  const char *query_string_mark = 0;
+  const char *path_mark = 0;
+  const char *url_mark = 0;
+
+  if (state == s_header_field)
+    header_field_mark = data;
+  if (state == s_header_value)
+    header_value_mark = data;
+  if (state == s_req_fragment)
+    fragment_mark = data;
+  if (state == s_req_query_string)
+    query_string_mark = data;
+  if (state == s_req_path)
+    path_mark = data;
+  if (state == s_req_path || state == s_req_schema || state == s_req_schema_slash
+      || state == s_req_schema_slash_slash || state == s_req_port
+      || state == s_req_query_string_start || state == s_req_query_string
+      || state == s_req_fragment_start || state == s_req_fragment)
+    url_mark = data;

  for (p=data, pe=data+len; p != pe; p++) {
    ch = *p;
@ -326,6 +340,42 @@ size_t http_parser_execute (http_parser *parser,
         */
        goto error;

+      case s_start_res_or_resp:
+      {
+        if (ch == CR || ch == LF)
+          break;
+        parser->flags = 0;
+        parser->content_length = -1;
+
+        CALLBACK2(message_begin);
+
+        if (ch == 'H')
+          state = s_res_or_resp_H;
+        else {
+          parser->type = HTTP_REQUEST;
+          if (ch < 'A' || 'Z' < ch) goto error;
+          parser->buffer[0] = ch;
+          index = 0;
+          state = s_req_method;
+        }
+        break;
+      }
+
+      case s_res_or_resp_H:
+        if (ch == 'T') {
+          parser->type = HTTP_RESPONSE;
+          state = s_res_HT;
+        } else {
+          if (ch < 'A' || 'Z' < ch) goto error;
+          parser->type = HTTP_REQUEST;
+          parser->method = (enum http_method) 0;
+          parser->buffer[0] = 'H';
+          parser->buffer[1] = ch;
+          index = 1;
+          state = s_req_method;
+        }
+        break;
+
      case s_start_res:
      {
        parser->flags = 0;
@ -637,6 +687,9 @@ size_t http_parser_execute (http_parser *parser,
        if (ch == ':') {
          state = s_req_schema_slash;
          break;
+        } else if (ch == '.') {
+          state = s_req_host;
+          break;
        }

        goto error;
@ -1156,12 +1209,14 @@ size_t http_parser_execute (http_parser *parser,

        if (!c) {
          if (ch == CR) {
+            CALLBACK(header_value);
            header_state = h_general;
            state = s_header_almost_done;
            break;
          }

          if (ch == LF) {
+            CALLBACK(header_value);
            state = s_header_field_start;
            break;
          }
@ -1547,15 +1602,8 @@ void
 http_parser_init (http_parser *parser, enum http_parser_type t)
 {
  parser->type = t;
-  parser->state = (t == HTTP_REQUEST ? s_start_req : s_start_res);
+  parser->state = (t == HTTP_REQUEST ? s_start_req : (t == HTTP_RESPONSE ? s_start_res : s_start_res_or_resp));
  parser->nread = 0;
  parser->upgrade = 0;
-
-  parser->header_field_mark = NULL;
-  parser->header_value_mark = NULL;
-  parser->query_string_mark = NULL;
-  parser->path_mark = NULL;
-  parser->url_mark = NULL;
-  parser->fragment_mark = NULL;
 }

--- a/deps/http_parser/http_parser.h
+++ b/deps/http_parser/http_parser.h
@ -89,22 +89,15 @@ enum http_method
  };


-enum http_parser_type { HTTP_REQUEST, HTTP_RESPONSE };
+enum http_parser_type { HTTP_REQUEST, HTTP_RESPONSE, HTTP_BOTH };


 struct http_parser {
  /** PRIVATE **/
-  enum http_parser_type type;
-  unsigned short state;
-  unsigned short header_state;
-  size_t index;
-
-  /* 1 = Upgrade header was present and the parser has exited because of that.
-   * 0 = No upgrade header present.
-   * Should be checked when http_parser_execute() returns in addition to
-   * error checking.
-   */
-  unsigned short upgrade;
+  unsigned char type;
+  unsigned char state;
+  unsigned char header_state;
+  unsigned char index;

  char flags;

@ -112,26 +105,20 @@ struct http_parser {
  ssize_t body_read;
  ssize_t content_length;

-  const char *header_field_mark;
-  size_t      header_field_size;
-  const char *header_value_mark;
-  size_t      header_value_size;
-  const char *query_string_mark;
-  size_t      query_string_size;
-  const char *path_mark;
-  size_t      path_size;
-  const char *url_mark;
-  size_t      url_size;
-  const char *fragment_mark;
-  size_t      fragment_size;
-
  /** READ-ONLY **/
  unsigned short status_code; /* responses only */
-  enum http_method method;    /* requests only */
+  unsigned short method;    /* requests only */
  unsigned short http_major;
  unsigned short http_minor;
  char buffer[HTTP_PARSER_MAX_METHOD_LEN];

+  /* 1 = Upgrade header was present and the parser has exited because of that.
+   * 0 = No upgrade header present.
+   * Should be checked when http_parser_execute() returns in addition to
+   * error checking.
+   */
+  char upgrade;
+
  /** PUBLIC **/
  void *data; /* A pointer to get hook to the "connection" or "socket" object */
 };
--- a/deps/http_parser/test.c
+++ b/deps/http_parser/test.c
@ -495,6 +495,30 @@ const struct message requests[] =
  ,.body= ""
  }

+#define CONNECT_REQUEST 17
+, {.name = "connect request"
+  ,.type= HTTP_REQUEST
+  ,.raw= "CONNECT home.netscape.com:443 HTTP/1.0\r\n"
+         "User-agent: Mozilla/1.1N\r\n"
+         "Proxy-authorization: basic aGVsbG86d29ybGQ=\r\n"
+         "\r\n"
+  ,.should_keep_alive= FALSE
+  ,.message_complete_on_eof= FALSE
+  ,.http_major= 1
+  ,.http_minor= 0
+  ,.method= HTTP_CONNECT
+  ,.query_string= ""
+  ,.fragment= ""
+  ,.request_path= ""
+  ,.request_url= "home.netscape.com:443"
+  ,.num_headers= 2
+  ,.upgrade=0
+  ,.headers= { { "User-agent", "Mozilla/1.1N" }
+             , { "Proxy-authorization", "basic aGVsbG86d29ybGQ=" }
+             }
+  ,.body= ""
+  }
+
 , {.name= NULL } /* sentinel */
 };

@ -721,6 +745,43 @@ const struct message responses[] =
  ,.body= ""
  }

+#define BONJOUR_MADAME_FR 8
+/* The client should not merge two header fields when the first one doesn't
+ * have a value.
+ */
+, {.name= "bonjourmadame.fr"
+  ,.type= HTTP_RESPONSE
+  ,.raw= "HTTP/1.0 301 Moved Permanently\r\n"
+         "Date: Thu, 03 Jun 2010 09:56:32 GMT\r\n"
+         "Server: Apache/2.2.3 (Red Hat)\r\n"
+         "Cache-Control: public\r\n"
+         "Pragma: \r\n"
+         "Location: http://www.bonjourmadame.fr/\r\n"
+         "Vary: Accept-Encoding\r\n"
+         "Content-Length: 0\r\n"
+         "Content-Type: text/html; charset=UTF-8\r\n"
+         "Connection: keep-alive\r\n"
+         "\r\n"
+  ,.should_keep_alive= TRUE
+  ,.message_complete_on_eof= FALSE
+  ,.http_major= 1
+  ,.http_minor= 0
+  ,.status_code= 301
+  ,.num_headers= 9
+  ,.headers=
+    { { "Date", "Thu, 03 Jun 2010 09:56:32 GMT" }
+    , { "Server", "Apache/2.2.3 (Red Hat)" }
+    , { "Cache-Control", "public" }
+    , { "Pragma", "" }
+    , { "Location", "http://www.bonjourmadame.fr/" }
+    , { "Vary",  "Accept-Encoding" }
+    , { "Content-Length", "0" }
+    , { "Content-Type", "text/html; charset=UTF-8" }
+    , { "Connection", "keep-alive" }
+    }
+  ,.body= ""
+  }
+
 , {.name= NULL } /* sentinel */
 };

@ -1207,82 +1268,84 @@ test_scan (const struct message *r1, const struct message *r2, const struct mess

  int total_len = strlen(total);

-  int total_ops = (total_len - 1) * (total_len - 2) / 2;
+  int total_ops = 2 * (total_len - 1) * (total_len - 2) / 2;
  int ops = 0 ;

  size_t buf1_len, buf2_len, buf3_len;

-  int i,j;
-  for (j = 2; j < total_len; j ++ ) {
-    for (i = 1; i < j; i ++ ) {
+  int i,j,type_both;
+  for (type_both = 0; type_both < 2; type_both ++ ) {
+    for (j = 2; j < total_len; j ++ ) {
+      for (i = 1; i < j; i ++ ) {

-      if (ops % 1000 == 0)  {
-        printf("\b\b\b\b%3.0f%%", 100 * (float)ops /(float)total_ops);
-        fflush(stdout);
-      }
-      ops += 1;
+        if (ops % 1000 == 0)  {
+          printf("\b\b\b\b%3.0f%%", 100 * (float)ops /(float)total_ops);
+          fflush(stdout);
+        }
+        ops += 1;

-      parser_init(r1->type);
+        parser_init(type_both ? HTTP_BOTH : r1->type);

-      buf1_len = i;
-      strncpy(buf1, total, buf1_len);
-      buf1[buf1_len] = 0;
+        buf1_len = i;
+        strncpy(buf1, total, buf1_len);
+        buf1[buf1_len] = 0;

-      buf2_len = j - i;
-      strncpy(buf2, total+i, buf2_len);
-      buf2[buf2_len] = 0;
+        buf2_len = j - i;
+        strncpy(buf2, total+i, buf2_len);
+        buf2[buf2_len] = 0;

-      buf3_len = total_len - j;
-      strncpy(buf3, total+j, buf3_len);
-      buf3[buf3_len] = 0;
+        buf3_len = total_len - j;
+        strncpy(buf3, total+j, buf3_len);
+        buf3[buf3_len] = 0;

-      read = parse(buf1, buf1_len);
-      if (read != buf1_len) {
-        print_error(buf1, read);
-        goto error;
-      }
+        read = parse(buf1, buf1_len);
+        if (read != buf1_len) {
+          print_error(buf1, read);
+          goto error;
+        }

-      read = parse(buf2, buf2_len);
-      if (read != buf2_len) {
-        print_error(buf2, read);
-        goto error;
-      }
+        read = parse(buf2, buf2_len);
+        if (read != buf2_len) {
+          print_error(buf2, read);
+          goto error;
+        }

-      read = parse(buf3, buf3_len);
-      if (read != buf3_len) {
-        print_error(buf3, read);
-        goto error;
-      }
+        read = parse(buf3, buf3_len);
+        if (read != buf3_len) {
+          print_error(buf3, read);
+          goto error;
+        }

-      parse(NULL, 0);
+        parse(NULL, 0);

-      if (3 != num_messages) {
-        fprintf(stderr, "\n\nParser didn't see 3 messages only %d\n", num_messages);
-        goto error;
-      }
+        if (3 != num_messages) {
+          fprintf(stderr, "\n\nParser didn't see 3 messages only %d\n", num_messages);
+          goto error;
+        }

-      if (!message_eq(0, r1)) {
-        fprintf(stderr, "\n\nError matching messages[0] in test_scan.\n");
-        goto error;
-      }
+        if (!message_eq(0, r1)) {
+          fprintf(stderr, "\n\nError matching messages[0] in test_scan.\n");
+          goto error;
+        }

-      if (!message_eq(1, r2)) {
-        fprintf(stderr, "\n\nError matching messages[1] in test_scan.\n");
-        goto error;
-      }
+        if (!message_eq(1, r2)) {
+          fprintf(stderr, "\n\nError matching messages[1] in test_scan.\n");
+          goto error;
+        }

-      if (!message_eq(2, r3)) {
-        fprintf(stderr, "\n\nError matching messages[2] in test_scan.\n");
-        goto error;
-      }
+        if (!message_eq(2, r3)) {
+          fprintf(stderr, "\n\nError matching messages[2] in test_scan.\n");
+          goto error;
+        }

-      parser_free();
+        parser_free();
+      }
    }
  }
  puts("\b\b\b\b100%");
  return;

-error:
+ error:
  fprintf(stderr, "i=%d  j=%d\n", i, j);
  fprintf(stderr, "buf1 (%u) %s\n\n", (unsigned int)buf1_len, buf1);
  fprintf(stderr, "buf2 (%u) %s\n\n", (unsigned int)buf2_len , buf2);
@ -1395,12 +1458,18 @@ main (void)



-  printf("response scan 1/1      ");
+  printf("response scan 1/2      ");
  test_scan( &responses[TRAILING_SPACE_ON_CHUNKED_BODY]
           , &responses[NO_HEADERS_NO_BODY_404]
           , &responses[NO_REASON_PHRASE]
           );

+  printf("response scan 1/2      ");
+  test_scan( &responses[BONJOUR_MADAME_FR]
+           , &responses[UNDERSTORE_HEADER_KEY]
+           , &responses[NO_CARRIAGE_RET]
+           );
+
  puts("responses okay");


--- a/src/node_http_parser.cc
+++ b/src/node_http_parser.cc
@ -102,7 +102,7 @@ static struct http_parser_settings settings;


 static inline Persistent<String>
-method_to_str(enum http_method m) {
+method_to_str(unsigned short m) {
  switch (m) {
    case HTTP_DELETE:     return delete_sym;
    case HTTP_GET:        return get_sym;