Hi Mohamed,
> This patch add HTTP parsing to gweb, it will parse the HTTP
> response then send the body to caller. This patch also add
> support to chunk encoding.
> ---
> gweb/gweb.c | 314
> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> gweb/gweb.h | 3 +
> tools/web-test.c | 18 +++-
> 3 files changed, 332 insertions(+), 3 deletions(-)
So far so good, but you really have to break this into smaller pieces.
This patch is too big.
> diff --git a/gweb/gweb.c b/gweb/gweb.c
> index 39f8ecf..c3990c8 100644
> --- a/gweb/gweb.c
> +++ b/gweb/gweb.c
> @@ -35,6 +35,39 @@
> #include "gresolv.h"
> #include "gweb.h"
>
> +#define MAX_LINE_LEN 600
> +#define MAX_CHUNK_LEN 17
Where are these values coming from?
> +enum chunk_state {
> + CHUNK_SIZE,
> + CHUNK_R,
> + CHUNK_R_BODY,
> + CHUNK_N,
> + CHUNK_N_BODY,
> + CHUNK_DATA,
> +};
> +
> +struct web_chunk_data {
> + enum chunk_state chunck_state;
> + int chunk_size;
> + int chunk_left;
> +};
I don't see you handing over this data, so you can just add these three
variables to web_data. That makes the access simpler.
So unless it seems to be a good idea to hand chunk_data to a function,
this breakup into different structs is not needed at all.
> +struct web_data {
> + char line_buf[MAX_LINE_LEN];
> + int line_index;
> +
> + unsigned long int total_len;
> + unsigned long int content_len;
> + unsigned long int http_status;
> +
> + gboolean use_chunk;
> + gboolean header_ready;
> + gboolean done;
> +
> + struct web_chunk_data chunk;
> +};
> +
> struct web_session {
> GWeb *web;
>
> @@ -49,8 +82,11 @@ struct web_session {
> guint resolv_action;
> char *request;
>
> + GWebReceivedFunc received_func;
> GWebResultFunc result_func;
> gpointer result_data;
> +
> + struct web_data data;
> };
What is your plan here? Why so many child structs? I don't see them used
in that way.
> struct _GWeb {
> @@ -191,12 +227,267 @@ gboolean g_web_add_nameserver(GWeb *web, const char
> *address)
> return TRUE;
> }
>
> +static void init_chunk_data(struct web_session *session)
> +{
> + memset(session->data.line_buf, 0, MAX_LINE_LEN);
> + session->data.line_index = 0;
> +
> + session->data.chunk.chunck_state = CHUNK_SIZE;
> + session->data.chunk.chunk_size = 0;
> + session->data.chunk.chunk_left = 0;
> +}
> +
> +static void init_download_data(struct web_session *session)
> +{
> +
> + init_chunk_data(session);
> +
> + session->data.http_status = 0;
> + session->data.total_len = 0;
> + session->data.content_len = 0;
> +
> + session->data.use_chunk = FALSE;
> + session->data.header_ready = FALSE;
> + session->data.done = FALSE;
> +}
For most pieces you can assume that g_try_new0 will set things to 0 and
to FALSE. So why bother? And since the whole memory for all of them is
allocated this way, this code does nothing new besides setting
CHUNK_SIZE.
> +static int append_to_line(struct web_session *session, char *buf, int len)
> +{
> + char *ptr;
> +
> + if ((session->data.line_index + len) >= MAX_LINE_LEN)
> + return -EXFULL;
> +
> + ptr = session->data.line_buf + session->data.line_index;
> + memcpy(ptr, buf, len);
> +
> + session->data.line_index += len;
> + session->data.line_buf[session->data.line_index] = '\0';
> +
> + return 0;
> +}
Maybe just using GByteArray is a bit simpler here. It will grow for you
and you can even have uint8 where GString would be more bound to
strings. However HTTP headers are strings, so please think about this a
little bit.
> +static int append_to_chunk_size(struct web_session *session, char *buf, int
> len)
> +{
> + if ((session->data.line_index + len) >= MAX_CHUNK_LEN)
> + return -EXFULL;
> +
> + return append_to_line(session, buf, len);
> +}
> +
> +static void send_client_payload(struct web_session *session, char *buf, int
> len)
> +{
> + if (session->received_func != NULL)
> + session->received_func(buf, len, session->result_data);
> +}
> +
> +static int decode_chunked(struct web_session *session, char *buf, int len)
> +{
> + int ret;
> + unsigned int counter;
> + struct web_data *data;
> +
> + data = &session->data;
The general rule is to a) use err instead of ret, b) put ints like i, err,
etc. at the end of the variable list, and c) do web_data *data = &sess->data
at the top of the list.
> + while (len > 0) {
> + switch (data->chunk.chunck_state) {
> + case CHUNK_SIZE:
> + if (g_ascii_isxdigit(*buf) == TRUE) {
> + ret = append_to_chunk_size(session, buf, 1);
> + if (ret != 0)
> + return ret;
The check should always be err < 0.
> + } else {
> + ret = sscanf(session->data.line_buf,
> + "%x", &counter);
> + if (ret != 1)
> + return -EILSEQ;
> +
> + data->chunk.chunk_size = counter;
> + data->chunk.chunk_left = counter;
> +
> + data->chunk.chunck_state = CHUNK_R;
> + break;
> + }
> + buf++;
> + len--;
> + break;
> + case CHUNK_R:
> + case CHUNK_R_BODY:
> + if (*buf == ' ') {
> + buf++;
> + len--;
> + break;
> + }
> +
> + if (*buf != '\r')
> + return -EILSEQ;
> +
> + if (data->chunk.chunck_state == CHUNK_R)
> + data->chunk.chunck_state = CHUNK_N;
> + else
> + data->chunk.chunck_state = CHUNK_N_BODY;
> + buf++;
> + len--;
> + break;
> + case CHUNK_N:
> + case CHUNK_N_BODY:
> + if (*buf != '\n')
> + return -EILSEQ;
> +
> + if (data->chunk.chunck_state == CHUNK_N)
> + data->chunk.chunck_state = CHUNK_DATA;
> + else
> + session->data.chunk.chunck_state = CHUNK_SIZE;
> + buf++;
> + len--;
> + break;
> + case CHUNK_DATA:
> + if (data->chunk.chunk_size == 0) {
> + session->data.done = TRUE;
> + debug(session->web, "Download Done in chunk");
> + return 0;
> + }
> +
> + if (data->chunk.chunk_left <= len) {
> + send_client_payload(session, buf,
> + data->chunk.chunk_left);
> + data->chunk.chunck_state = CHUNK_R_BODY;
> + len -= data->chunk.chunk_left;
> + buf += data->chunk.chunk_left;
> + data->total_len += data->chunk.chunk_left;
> + init_chunk_data(session);
> + data->chunk.chunck_state = CHUNK_R_BODY;
> + break;
> + }
> + /* more data */
> + send_client_payload(session, buf, len);
> + data->chunk.chunk_left -= len;
> + len -= len;
> + buf += len;
> + data->total_len += len;
> + break;
> + }
> + }
> +
> + return 0;
> +}
I am not sure if this is really highly efficient here. It looks to me
like we are wasting a lot of resources, while we could just check for
the line end marker and then process the HTTP header items right away.
> +static int decode_header(struct web_session *session,
> + char *buf, int len)
> +{
> + char *ptr, *end_line, *str;
> + int line_len;
> + int ret;
> +
> + ptr = buf;
> +
> + while (len > 0) {
> + end_line = memchr(ptr, '\n', len);
> + if (end_line == NULL) {
> + if (append_to_line(session, ptr, len) < 0)
> + return -EXFULL;
> + return 0;
> + }
> +
> + line_len = end_line - ptr;
> + line_len += 1;
> +
> + if (append_to_line(session, ptr, line_len) < 0)
> + return -EXFULL;
> +
> + if ((session->data.line_buf[0] == '\r') ||
> + (session->data.line_buf[0] == '\n')) {
The extra () are not needed.
> + /* empty line http header is done */
> + session->data.header_ready = TRUE;
> + debug(session->web, "content len:%lu http status%lu",
> + session->data.content_len,
> + session->data.http_status);
> + if (session->data.http_status != 0)
> + return (end_line - buf) + 1;
> + else
> + return -1;
> + }
> + /* first line should be http status */
> + if (session->data.http_status == 0) {
> +
> + ret = sscanf(session->data.line_buf,
> + "HTTP/1.%*d %lu",
> + &session->data.http_status);
> + if ((ret != 1)) {
What are the extra () for? I don't see the point.
> + debug(session->web, "error status %lu",
> + session->data.http_status);
> + return -1;
> + }
> + } else if (g_ascii_strncasecmp("Transfer-Encoding:",
> + session->data.line_buf, 18) == 0) {
> + str = g_strrstr(session->data.line_buf, "chunked");
> + if (str != NULL) {
> + init_chunk_data(session);
> + session->data.use_chunk = TRUE;
> + }
> + } else if (g_ascii_strncasecmp("Content-Length:",
> + session->data.line_buf, 15) == 0) {
> + sscanf(session->data.line_buf,
> + "Content-Length: %lu",
> + &session->data.content_len);
This is highly inefficient. You already scanned this key part. Just use
an offset into the string.
> + }
> +
> + ptr += line_len;
> + len -= line_len;
> +
> + memset(session->data.line_buf, 0, MAX_LINE_LEN);
> + session->data.line_index = 0;
> + }
> +
> + return 0;
> +}
Seems like we look at every single byte at least twice and at many of
them actually three times while receiving the header and body.
> +static int decode_function(struct web_session *session,
> + char *buf, int len)
> +{
> + int ret;
> +
> + /* check if we still reading HTTP header */
> + if (session->data.header_ready == FALSE) {
> + ret = decode_header(session, buf, len);
> +
> + if (ret < 0)
> + return ret;
> + else if (ret == 0)
> + return 0;
> +
> + memset(session->data.line_buf, 0, MAX_LINE_LEN);
> + session->data.line_index = 0;
> +
> + buf += ret;
> + len -= ret;
> + }
> +
> + if (len <= 0)
> + return 0;
> +
> + if (session->data.use_chunk)
> + return decode_chunked(session, buf, len);
> +
> + session->data.total_len += len;
> +
> + send_client_payload(session, buf, len);
> +
> + if ((session->data.content_len != 0) &&
> + (session->data.total_len >= session->data.content_len)) {
> + debug(session->web, "Downloan complete");
> + session->data.done = TRUE;
> + }
> +
> + return 0;
> +}
> +
So I would use GByteArray to receive the header. And keep track of the
offset. If the new chunk contains the line end marker, then go ahead and
parse the header item. If we care about it, store its value; if not,
then discard it, truncate that part of the GByteArray and continue.
Once we have finished with the header, process the body, but hand the
chunks directly to the callback and don't even bother with their content.
> static gboolean received_data(GIOChannel *channel, GIOCondition cond,
> gpointer user_data)
> {
> struct web_session *session = user_data;
> - unsigned char buf[4096];
> + char buf[4096];
> int sk, len;
> + int ret;
>
> if (cond & (G_IO_NVAL | G_IO_ERR | G_IO_HUP)) {
> session->transport_watch = 0;
> @@ -216,7 +507,22 @@ static gboolean received_data(GIOChannel *channel,
> GIOCondition cond,
> session->result_func(200, session->result_data);
> return FALSE;
> }
> - printf("%s", buf);
> +
> + ret = decode_function(session, buf, len);
> +
> + if (ret != 0) {
> + session->transport_watch = 0;
> + if (session->result_func != NULL)
> + session->result_func(ret, session->result_data);
> + return FALSE;
> + }
> + if (session->data.done == TRUE) {
> + session->transport_watch = 0;
> + if (session->result_func != NULL)
> + session->result_func(session->data.http_status,
> + session->result_data);
> + return FALSE;
> + }
>
> return TRUE;
> }
> @@ -265,6 +571,8 @@ static void start_request(struct web_session *session)
> debug(session->web, "request %s from %s",
> session->request, session->host);
>
> + init_download_data(session);
> +
> sk = g_io_channel_unix_get_fd(session->transport_channel);
>
> buf = g_string_new(NULL);
> @@ -366,6 +674,7 @@ static void resolv_result(GResolvResultStatus status,
> }
>
> guint g_web_request(GWeb *web, GWebMethod method, const char *url,
> + GWebReceivedFunc rec_func,
> GWebResultFunc func, gpointer user_data)
We do need an extra callback after the headers have been read and before
we continue with the body. In a lot of cases just reading the header is
enough for us. Then we need to be able to return with either TRUE/FALSE
to indicate that we are done with downloading. Same goes for the actual
receive function. We want to be able to just stop the download at any
point in time.
I am also thinking that we have to create a GWebSession object as well.
So where GWeb represents the interface which we use, GWebSession then
represents the remote server. Since with proxies etc., we can
connect to server1, but request a URL for server2. So for complex usage
we need to split this.
However it is also good to have a simple g_web_request that just takes
care of asking PACrunner for the proxy etc. and just downloads the file
for us.
Maybe GWeb should be considered our web session, and besides the index we
give it a hostname we expect it to connect to. And only if NULL, we run
a proxy resolution automatically. I do want a simple interface and not
something staged with multiple steps. So this might even be a bit better.
I am open to ideas. And since we focus on the GLib mainloop, we can make
some assumptions and don't have to go as crazy as cURL or other libraries
do for async download support.
Regards
Marcel
_______________________________________________
connman mailing list
[email protected]
http://lists.connman.net/listinfo/connman