mime_parser.c

00001 /*
00002  * $Id: mime_parser.c 5147 2007-05-08 15:36:22Z ajc $
00003  */
00012 #include "webcit.h"
00013 #include "webserver.h"
00014 #include "mime_parser.h"
00015 
/*
 * Extract the value of a "key=value" parameter from a header field.
 *
 * target  receives the NUL-terminated value.  The whole of source is copied
 *         into it first, so it must be at least strlen(source)+1 bytes and
 *         must not be the same buffer as source.
 * source  header text to search, e.g.  text/plain; charset="utf-8"
 * key     parameter name; compared case-insensitively.
 *
 * The key is looked for at every offset of source, so it also matches when
 * it is the tail of a longer parameter name (searching for "name" will
 * match inside "filename=").  The first match wins.
 *
 * One leading double quote is dropped and the value is cut at the next
 * double quote.  An unquoted value runs to the end of source.
 * If the key is not found, target is set to the empty string.
 */
void extract_key(char *target, char *source, char *key)
{
        size_t keylen = strlen(key);
        size_t a;
        char *value;
        char *quote;

        strcpy(target, source);
        for (a = 0; target[a] != 0; ++a) {
                if ((!strncasecmp(&target[a], key, keylen))
                    && (target[a + keylen] == '=')) {
                        value = &target[a + keylen + 1];
                        if (*value == '"') {
                                ++value;
                        }
                        /* Source and destination overlap, so strcpy() would
                         * be undefined behaviour here; memmove() is not. */
                        memmove(target, value, strlen(value) + 1);
                        quote = strchr(target, '"');
                        if (quote != NULL) {
                                *quote = 0;
                        }
                        return;
                }
        }
        target[0] = 0;
}
00035 
00036 
/*
 * Callbacks are always handed a part number.  A message that is not
 * multipart has none of its own, so a missing (NULL) or empty part number
 * is reported as "1"; anything else is passed through untouched.
 */
char *fixed_partnum(char *supplied_partnum) {
        int have_partnum = (supplied_partnum != NULL)
                        && (supplied_partnum[0] != '\0');

        return have_partnum ? supplied_partnum : "1";
}
00046 
00047 
00048 
/*
 * Value of one hexadecimal digit (either case), or -1 if c is not one.
 */
static int qp_hex_digit_value(char c)
{
        if ((c >= '0') && (c <= '9')) return c - '0';
        if ((c >= 'A') && (c <= 'F')) return c - 'A' + 10;
        if ((c >= 'a') && (c <= 'f')) return c - 'a' + 10;
        return -1;
}

/*
 * Convert "quoted-printable" to binary, according to RFC2045 section 6.7.
 *
 * decoded    output buffer.  Every output byte consumes at least one input
 *            byte, so sourcelen + 1 bytes is always enough (the +1 is for
 *            the terminating NUL that is appended).
 * encoded    input data; only the first sourcelen bytes are examined.
 * sourcelen  number of input bytes.
 *
 * Returns the number of bytes decoded (not counting the terminating NUL).
 *
 * Handling of '=':
 *   "=\r\n" and "=\n"   soft line break, produces no output
 *   "=" or "=\r" as the very last input bytes
 *                       treated as a soft line break whose line terminator
 *                       lies outside the supplied range
 *   "=XX" (two hex digits, either case)
 *                       the byte with that value
 *   anything else       malformed; the '=' is copied through literally and
 *                       decoding resumes with the next byte
 */
int CtdlDecodeQuotedPrintable(char *decoded, char *encoded, int sourcelen) {
        int decoded_length = 0;
        int pos = 0;

        while (pos < sourcelen)
        {
                int remaining = sourcelen - pos;
                int hi, lo;

                if (encoded[pos] != '=')
                {
                        decoded[decoded_length++] = encoded[pos];
                        pos += 1;
                        continue;
                }

                /* '=' is the last byte: nothing after it may be read */
                if (remaining == 1)
                {
                        pos += 1;
                        continue;
                }

                if (encoded[pos + 1] == '\n')
                {
                        pos += 2;
                        continue;
                }

                if (encoded[pos + 1] == '\r')
                {
                        if (remaining == 2)
                        {
                                pos += 2;
                                continue;
                        }
                        if (encoded[pos + 2] == '\n')
                        {
                                pos += 3;
                                continue;
                        }
                }

                if (remaining >= 3)
                {
                        hi = qp_hex_digit_value(encoded[pos + 1]);
                        lo = qp_hex_digit_value(encoded[pos + 2]);
                        if ((hi >= 0) && (lo >= 0))
                        {
                                decoded[decoded_length++] = (char) ((hi << 4) | lo);
                                pos += 3;
                                continue;
                        }
                }

                /* Malformed escape: keep the '=' as ordinary data */
                decoded[decoded_length++] = '=';
                pos += 1;
        }
        decoded[decoded_length] = 0;
        return(decoded_length);
}
00084 
00085 
/*
 * Deliver one message part (body pointer + length) to CallBack, undoing its
 * transfer encoding first when that is both needed and wanted.
 *
 * - "7bit", "8bit" and "binary" are not real encodings; the caller's
 *   encoding buffer is overwritten with "" when it holds one of them.
 * - When the encoding is empty, or dont_decode is nonzero, the body is
 *   handed to CallBack exactly as supplied.
 * - "base64" and "quoted-printable" bodies are decoded into a temporary
 *   buffer, which is handed to CallBack with the encoding reported as
 *   "binary" and is freed before returning.  CallBack is skipped when the
 *   decoder produced zero bytes.
 * - Any other encoding, or failure to allocate the buffer, makes the
 *   function return without calling CallBack.
 *
 * The part number given to CallBack goes through fixed_partnum().
 * PreMultiPartCallBack and PostMultiPartCallBack are accepted but not used
 * by this function.
 */
void mime_decode(char *partnum,
                 char *part_start, size_t length,
                 char *content_type, char *charset, char *encoding,
                 char *disposition,
                 char *name, char *filename,
                 void (*CallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                 void (*PreMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                 void (*PostMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                  void *userdata,
                  int dont_decode
)
{
        char *outbuf;
        size_t outlen = 0;
        int is_base64;
        int is_qp;

        /* Identity "encodings" are normalised to the empty string */
        if ( (strcasecmp(encoding, "7bit") == 0)
           || (strcasecmp(encoding, "8bit") == 0)
           || (strcasecmp(encoding, "binary") == 0) ) {
                encoding[0] = '\0';
        }

        /* Nothing to decode, or decoding not wanted: pass the body through */
        if ( (encoding[0] == '\0') || dont_decode ) {
                if (CallBack != NULL) {
                        CallBack(name, filename, fixed_partnum(partnum),
                                disposition, part_start,
                                content_type, charset, length, encoding, userdata);
                }
                return;
        }

        /* Encodings we cannot handle are dropped without comment */
        is_base64 = (strcasecmp(encoding, "base64") == 0);
        is_qp = (strcasecmp(encoding, "quoted-printable") == 0);
        if (!is_base64 && !is_qp) {
                return;
        }

        /*
         * The output buffer is the input size plus some slack, on the
         * assumption that decoding never makes the data significantly
         * larger than its encoded form.
         */
        outbuf = malloc(length + 32768);
        if (outbuf == NULL) {
                return;
        }

        if (is_base64) {
                outlen = CtdlDecodeBase64(outbuf, part_start, length);
        }
        else {
                outlen = CtdlDecodeQuotedPrintable(outbuf, part_start, length);
        }

        if ( (outlen > 0) && (CallBack != NULL) ) {
                CallBack(name, filename, fixed_partnum(partnum),
                        disposition, outbuf,
                        content_type, charset, outlen, "binary", userdata);
        }

        free(outbuf);
}
00186 
00187 /*
00188  * Break out the components of a multipart message
00189  * (This function expects to be fed HEADERS + CONTENT)
00190  * Note: NULL can be supplied as content_end; in this case, the message is
00191  * considered to have ended when the parser encounters a 0x00 byte.
00192  */
00193 void the_mime_parser(char *partnum,
00194                      char *content_start, char *content_end,
00195                      void (*CallBack)
00196                       (char *cbname,
00197                        char *cbfilename,
00198                        char *cbpartnum,
00199                        char *cbdisp,
00200                        void *cbcontent,
00201                        char *cbtype,
00202                        char *cbcharset,
00203                        size_t cblength,
00204                        char *cbencoding,
00205                        void *cbuserdata),
00206                      void (*PreMultiPartCallBack)
00207                       (char *cbname,
00208                        char *cbfilename,
00209                        char *cbpartnum,
00210                        char *cbdisp,
00211                        void *cbcontent,
00212                        char *cbtype,
00213                        char *cbcharset,
00214                        size_t cblength,
00215                        char *cbencoding,
00216                        void *cbuserdata),
00217                      void (*PostMultiPartCallBack)
00218                       (char *cbname,
00219                        char *cbfilename,
00220                        char *cbpartnum,
00221                        char *cbdisp,
00222                        void *cbcontent,
00223                        char *cbtype,
00224                        char *cbcharset,
00225                        size_t cblength,
00226                        char *cbencoding,
00227                        void *cbuserdata),
00228                       void *userdata,
00229                       int dont_decode
00230 )
00231 {
00232 
00233         char *ptr;
00234         char *srch = NULL;
00235         char *part_start, *part_end = NULL;
00236         char buf[SIZ];
00237         char *header;
00238         char *boundary;
00239         char *startary;
00240         size_t startary_len = 0;
00241         char *endary;
00242         char *next_boundary;
00243         char *content_type;
00244         char *charset;
00245         size_t content_length;
00246         char *encoding;
00247         char *disposition;
00248         char *name = NULL;
00249         char *content_type_name;
00250         char *content_disposition_name;
00251         char *filename;
00252         int is_multipart;
00253         int part_seq = 0;
00254         int i;
00255         size_t length;
00256         char nested_partnum[256];
00257         int crlf_in_use = 0;
00258         char *evaluate_crlf_ptr = NULL;
00259 
00260         ptr = content_start;
00261         content_length = 0;
00262 
00263         boundary = malloc(SIZ);
00264         memset(boundary, 0, SIZ);
00265 
00266         startary = malloc(SIZ);
00267         memset(startary, 0, SIZ);
00268 
00269         endary = malloc(SIZ);
00270         memset(endary, 0, SIZ);
00271 
00272         header = malloc(SIZ);
00273         memset(header, 0, SIZ);
00274 
00275         content_type = malloc(SIZ);
00276         memset(content_type, 0, SIZ);
00277 
00278         charset = malloc(SIZ);
00279         memset(charset, 0, SIZ);
00280 
00281         encoding = malloc(SIZ);
00282         memset(encoding, 0, SIZ);
00283 
00284         content_type_name = malloc(SIZ);
00285         memset(content_type_name, 0, SIZ);
00286 
00287         content_disposition_name = malloc(SIZ);
00288         memset(content_disposition_name, 0, SIZ);
00289 
00290         filename = malloc(SIZ);
00291         memset(filename, 0, SIZ);
00292 
00293         disposition = malloc(SIZ);
00294         memset(disposition, 0, SIZ);
00295 
00296         /* If the caller didn't supply an endpointer, generate one by measure */
00297         if (content_end == NULL) {
00298                 content_end = &content_start[strlen(content_start)];
00299         }
00300 
00301         /* Learn interesting things from the headers */
00302         strcpy(header, "");
00303         do {
00304                 ptr = memreadline(ptr, buf, SIZ);
00305                 if (ptr >= content_end) {
00306                         goto end_parser;
00307                 }
00308 
00309                 for (i = 0; i < strlen(buf); ++i) {
00310                         if (isspace(buf[i])) {
00311                                 buf[i] = ' ';
00312                         }
00313                 }
00314 
00315                 if (!isspace(buf[0])) {
00316                         if (!strncasecmp(header, "Content-type:", 13)) {
00317                                 strcpy(content_type, &header[13]);
00318                                 striplt(content_type);
00319                                 extract_key(content_type_name, content_type, "name");
00320                                 extract_key(charset, content_type, "charset");
00321                                 /* Deal with weird headers */
00322                                 if (strchr(content_type, ' '))
00323                                         *(strchr(content_type, ' ')) = '\0';
00324                                 if (strchr(content_type, ';'))
00325                                         *(strchr(content_type, ';')) = '\0';
00326                         }
00327                         if (!strncasecmp(header, "Content-Disposition:", 20)) {
00328                                 strcpy(disposition, &header[20]);
00329                                 striplt(disposition);
00330                                 extract_key(content_disposition_name, disposition, "name");
00331                                 extract_key(filename, disposition, "filename");
00332                         }
00333                         if (!strncasecmp(header, "Content-length: ", 15)) {
00334                                 char clbuf[10];
00335                                 safestrncpy(clbuf, &header[15], sizeof clbuf);
00336                                 striplt(clbuf);
00337                                 content_length = (size_t) atol(clbuf);
00338                         }
00339                         if (!strncasecmp(header, "Content-transfer-encoding: ", 26)) {
00340                                 strcpy(encoding, &header[26]);
00341                                 striplt(encoding);
00342                         }
00343                         if (strlen(boundary) == 0)
00344                                 extract_key(boundary, header, "boundary");
00345                         strcpy(header, "");
00346                 }
00347                 if ((strlen(header) + strlen(buf) + 2) < SIZ) {
00348                         strcat(header, buf);
00349                 }
00350         } while ((strlen(buf) > 0) && (*ptr != 0));
00351 
00352         if (strchr(disposition, ';'))
00353                 *(strchr(disposition, ';')) = '\0';
00354         striplt(disposition);
00355         if (strchr(content_type, ';'))
00356                 *(strchr(content_type, ';')) = '\0';
00357         striplt(content_type);
00358 
00359         if (strlen(boundary) > 0) {
00360                 is_multipart = 1;
00361         } else {
00362                 is_multipart = 0;
00363         }
00364 
00365         /* If this is a multipart message, then recursively process it */
00366         part_start = NULL;
00367         if (is_multipart) {
00368 
00369                 /* Tell the client about this message's multipartedness */
00370                 if (PreMultiPartCallBack != NULL) {
00371                         PreMultiPartCallBack("", "", partnum, "",
00372                                 NULL, content_type, charset,
00373                                 0, encoding, userdata);
00374                 }
00375 
00376                 /* Figure out where the boundaries are */
00377                 snprintf(startary, SIZ, "--%s", boundary);
00378                 snprintf(endary, SIZ, "--%s--", boundary);
00379                 startary_len = strlen(startary);
00380 
00381                 part_start = NULL;
00382                 do {
00383                         next_boundary = NULL;
00384                         for (srch=ptr; srch<content_end; ++srch) {
00385                                 if (!memcmp(srch, startary, startary_len)) {
00386                                         next_boundary = srch;
00387                                         srch = content_end;
00388                                 }
00389                         }
00390 
00391                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
00392                                 part_end = next_boundary;
00393                                 --part_end;             /* omit the trailing LF */
00394                                 if (crlf_in_use) {
00395                                         --part_end;     /* omit the trailing CR */
00396                                 }
00397 
00398                                 if (strlen(partnum) > 0) {
00399                                         snprintf(nested_partnum,
00400                                                  sizeof nested_partnum,
00401                                                  "%s.%d", partnum,
00402                                                  ++part_seq);
00403                                 }
00404                                 else {
00405                                         snprintf(nested_partnum,
00406                                                  sizeof nested_partnum,
00407                                                  "%d", ++part_seq);
00408                                 }
00409                                 the_mime_parser(nested_partnum,
00410                                             part_start, part_end,
00411                                                 CallBack,
00412                                                 PreMultiPartCallBack,
00413                                                 PostMultiPartCallBack,
00414                                                 userdata,
00415                                                 dont_decode);
00416                         }
00417 
00418                         if (next_boundary != NULL) {
00419                                 /* If we pass out of scope, don't attempt to
00420                                  * read past the end boundary. */
00421                                 if (!strcmp(next_boundary, endary)) {
00422                                         ptr = content_end;
00423                                 }
00424                                 else {
00425                                         /* Set up for the next part. */
00426                                         part_start = strstr(next_boundary, "\n");
00427                                         
00428                                         /* Determine whether newlines are LF or CRLF */
00429                                         evaluate_crlf_ptr = part_start;
00430                                         --evaluate_crlf_ptr;
00431                                         if (!memcmp(evaluate_crlf_ptr, "\r\n", 2)) {
00432                                                 crlf_in_use = 1;
00433                                         }
00434                                         else {
00435                                                 crlf_in_use = 0;
00436                                         }
00437 
00438                                         /* Advance past the LF ... now we're in the next part */
00439                                         ++part_start;
00440                                         ptr = part_start;
00441                                 }
00442                         }
00443                         else {
00444                                 /* Invalid end of multipart.  Bail out! */
00445                                 ptr = content_end;
00446                         }
00447                 } while ( (ptr < content_end) && (next_boundary != NULL) );
00448 
00449                 if (PostMultiPartCallBack != NULL) {
00450                         PostMultiPartCallBack("", "", partnum, "", NULL,
00451                                 content_type, charset, 0, encoding, userdata);
00452                 }
00453                 goto end_parser;
00454         }
00455 
00456         /* If it's not a multipart message, then do something with it */
00457         if (!is_multipart) {
00458                 part_start = ptr;
00459                 length = 0;
00460                 while (ptr < content_end) {
00461                         ++ptr;
00462                         ++length;
00463                 }
00464                 part_end = content_end;
00465 
00466                 /******
00467                  * I thought there was an off-by-one error here, but there isn't.
00468                  * This probably means that there's an off-by-one error somewhere
00469                  * else ... or maybe only in certain messages?
00470                 --part_end;
00471                 --length;
00472                 ******/
00473                 
00474                 /* Truncate if the header told us to */
00475                 if ( (content_length > 0) && (length > content_length) ) {
00476                         length = content_length;
00477                 }
00478 
00479                 /* Sometimes the "name" field is tacked on to Content-type,
00480                  * and sometimes it's tacked on to Content-disposition.  Use
00481                  * whichever one we have.
00482                  */
00483                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
00484                         name = content_disposition_name;
00485                 }
00486                 else {
00487                         name = content_type_name;
00488                 }
00489         
00490                 /* lprintf(CTDL_DEBUG, "mime_decode part=%s, len=%d, type=%s, charset=%s, encoding=%s\n",
00491                         partnum, length, content_type, charset, encoding); */
00492 
00493                 /* Ok, we've got a non-multipart part here, so do something with it.
00494                  */
00495                 mime_decode(partnum,
00496                         part_start, length,
00497                         content_type, charset, encoding, disposition,
00498                         name, filename,
00499                         CallBack, NULL, NULL,
00500                         userdata, dont_decode
00501                 );
00502 
00503                 /*
00504                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
00505                  */
00506                 if (!strcasecmp(content_type, "message/rfc822")) {
00507 
00508                         if (PreMultiPartCallBack != NULL) {
00509                                 PreMultiPartCallBack("", "", partnum, "",
00510                                         NULL, content_type, charset,
00511                                         0, encoding, userdata);
00512                         }
00513                         if (CallBack != NULL) {
00514                                 if (strlen(partnum) > 0) {
00515                                         snprintf(nested_partnum,
00516                                                  sizeof nested_partnum,
00517                                                  "%s.%d", partnum,
00518                                                  ++part_seq);
00519                                 }
00520                                 else {
00521                                         snprintf(nested_partnum,
00522                                                  sizeof nested_partnum,
00523                                                  "%d", ++part_seq);
00524                                 }
00525                                 the_mime_parser(nested_partnum,
00526                                         part_start, part_end,
00527                                         CallBack,
00528                                         PreMultiPartCallBack,
00529                                         PostMultiPartCallBack,
00530                                         userdata,
00531                                         dont_decode
00532                                 );
00533                         }
00534                         if (PostMultiPartCallBack != NULL) {
00535                                 PostMultiPartCallBack("", "", partnum, "", NULL,
00536                                         content_type, charset, 0, encoding, userdata);
00537                         }
00538 
00539 
00540                 }
00541 
00542         }
00543 
00544 end_parser:     /* free the buffers!  end the oppression!! */
00545         free(boundary);
00546         free(startary);
00547         free(endary);   
00548         free(header);
00549         free(content_type);
00550         free(charset);
00551         free(encoding);
00552         free(content_type_name);
00553         free(content_disposition_name);
00554         free(filename);
00555         free(disposition);
00556 }
00557 
00558 
00559 
/*
 * Public entry point for the MIME parser.
 *
 * Feed it a complete entity (HEADERS + CONTENT).  content_end may be NULL,
 * in which case the message is taken to end at the first 0x00 byte.
 *
 * This simply starts the_mime_parser() at the top level, i.e. with an empty
 * part number, forwarding the callbacks, userdata and dont_decode unchanged.
 */
void mime_parser(char *content_start,
                 char *content_end,
                 void (*CallBack)(char *cbname,
                                  char *cbfilename,
                                  char *cbpartnum,
                                  char *cbdisp,
                                  void *cbcontent,
                                  char *cbtype,
                                  char *cbcharset,
                                  size_t cblength,
                                  char *cbencoding,
                                  void *cbuserdata),
                 void (*PreMultiPartCallBack)(char *cbname,
                                              char *cbfilename,
                                              char *cbpartnum,
                                              char *cbdisp,
                                              void *cbcontent,
                                              char *cbtype,
                                              char *cbcharset,
                                              size_t cblength,
                                              char *cbencoding,
                                              void *cbuserdata),
                 void (*PostMultiPartCallBack)(char *cbname,
                                               char *cbfilename,
                                               char *cbpartnum,
                                               char *cbdisp,
                                               void *cbcontent,
                                               char *cbtype,
                                               char *cbcharset,
                                               size_t cblength,
                                               char *cbencoding,
                                               void *cbuserdata),
                 void *userdata,
                 int dont_decode)
{
        char *top_level_partnum = "";

        the_mime_parser(top_level_partnum,
                        content_start, content_end,
                        CallBack,
                        PreMultiPartCallBack,
                        PostMultiPartCallBack,
                        userdata,
                        dont_decode);
}
00616 

Generated on Wed Jun 20 23:13:09 2007 for webcit by  doxygen 1.5.2