summaryrefslogtreecommitdiffstats
path: root/modules/filters/mod_charset_lite.c
blob: e3d1ce9431da8466a8cd0c355eb567d0e4bc0612 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * simple hokey charset recoding configuration module
 *
 * See mod_ebcdic and mod_charset for more thought-out examples.  This
 * one is just so Jeff can learn how a module works and experiment with
 * basic character set recoding configuration.
 *
 * !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
 */

#include "httpd.h"
#include "http_config.h"

#include "http_core.h"
#include "http_log.h"
#include "http_main.h"
#include "http_protocol.h"
#include "http_request.h"
#include "util_charset.h"
#include "apr_buckets.h"
#include "util_filter.h"
#include "apr_strings.h"
#include "apr_lib.h"
#include "apr_xlate.h"
#define APR_WANT_STRFUNC
#include "apr_want.h"

#define OUTPUT_XLATE_BUF_SIZE (16*1024) /* size of translation buffer used on output */
#define INPUT_XLATE_BUF_SIZE  (8*1024)  /* size of translation buffer used on input */

#define XLATE_MIN_BUFF_LEFT 128  /* flush once there is no more than this much
                                  * space left in the translation buffer
                                  */

#define FATTEST_CHAR  8          /* we don't handle chars wider than this that straddle
                                  * two buckets
                                  */

/* extended error status codes; this is used in addition to an apr_status_t to
 * track errors in the translation filter
 */
typedef enum {
    EES_INIT = 0,   /* no error info yet; value must be 0 for easy init */
    EES_LIMIT,      /* built-in restriction encountered */
    EES_INCOMPLETE_CHAR, /* incomplete multi-byte char at end of content */
    EES_BUCKET_READ,
    EES_DOWNSTREAM, /* something bad happened in a filter below xlate */
    EES_BAD_INPUT   /* input data invalid */
} ees_t;

/* registered name of the output translation filter */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* registered name of input translation filter */
#define XLATEIN_FILTER_NAME  "XLATEIN"

typedef struct charset_dir_t {
    const char *charset_source; /* source encoding */
    const char *charset_default; /* how to ship on wire */
    /** module does ap_add_*_filter()? */
    enum {IA_INIT, IA_IMPADD, IA_NOIMPADD} implicit_add;
    /** treat all mimetypes as text? */
    enum {FX_INIT, FX_FORCE, FX_NOFORCE} force_xlate;
} charset_dir_t;

/* charset_filter_ctx_t is created for each filter instance; because the same
 * filter code is used for translating in both directions, we need this context
 * data to tell the filter which translation handle to use; it also can hold a
 * character which was split between buckets
 */
typedef struct charset_filter_ctx_t {
    apr_xlate_t *xlate;
    int is_sb;              /* single-byte translation? */
    charset_dir_t *dc;
    ees_t ees;              /* extended error status */
    apr_size_t saved;
    char buf[FATTEST_CHAR]; /* we want to be able to build a complete char here */
    int ran;                /* has filter instance run before? */
    int noop;               /* should we pass brigades through unchanged? */
    char *tmp;              /* buffer for input filtering */
    apr_bucket_brigade *bb; /* input buckets we couldn't finish translating */
    apr_bucket_brigade *tmpbb; /* used for passing downstream */
} charset_filter_ctx_t;

/* charset_req_t is available via r->request_config if any translation is
 * being performed
 */
typedef struct charset_req_t {
    charset_dir_t *dc;
    charset_filter_ctx_t *output_ctx, *input_ctx;
} charset_req_t;

module AP_MODULE_DECLARE_DATA charset_lite_module;

static void *create_charset_dir_conf(apr_pool_t *p,char *dummy)
{
    charset_dir_t *dc = (charset_dir_t *)apr_pcalloc(p,sizeof(charset_dir_t));

    return dc;
}

static void *merge_charset_dir_conf(apr_pool_t *p, void *basev, void *overridesv)
{
    charset_dir_t *a = (charset_dir_t *)apr_pcalloc (p, sizeof(charset_dir_t));
    charset_dir_t *base = (charset_dir_t *)basev,
        *over = (charset_dir_t *)overridesv;

    /* If it is defined in the current container, use it.  Otherwise, use the one
     * from the enclosing container.
     */

    a->charset_default =
        over->charset_default ? over->charset_default : base->charset_default;
    a->charset_source =
        over->charset_source ? over->charset_source : base->charset_source;
    a->implicit_add =
        over->implicit_add != IA_INIT ? over->implicit_add : base->implicit_add;
    a->force_xlate=
        over->force_xlate != FX_INIT ? over->force_xlate : base->force_xlate;
    return a;
}

/* CharsetSourceEnc charset
 */
static const char *add_charset_source(cmd_parms *cmd, void *in_dc,
                                      const char *name)
{
    charset_dir_t *dc = in_dc;

    dc->charset_source = name;
    return NULL;
}

/* CharsetDefault charset
 */
static const char *add_charset_default(cmd_parms *cmd, void *in_dc,
                                       const char *name)
{
    charset_dir_t *dc = in_dc;

    dc->charset_default = name;
    return NULL;
}

/* CharsetOptions optionflag...
 */
static const char *add_charset_options(cmd_parms *cmd, void *in_dc,
                                       const char *flag)
{
    charset_dir_t *dc = in_dc;

    if (!strcasecmp(flag, "ImplicitAdd")) {
        dc->implicit_add = IA_IMPADD;
    }
    else if (!strcasecmp(flag, "NoImplicitAdd")) {
        dc->implicit_add = IA_NOIMPADD;
    }
    else if (!strcasecmp(flag, "TranslateAllMimeTypes")) {
        dc->force_xlate = FX_FORCE;
    }
    else if (!strcasecmp(flag, "NoTranslateAllMimeTypes")) {
        dc->force_xlate = FX_NOFORCE;
    }
    else {
        return apr_pstrcat(cmd->temp_pool,
                           "Invalid CharsetOptions option: ",
                           flag,
                           NULL);
    }

    return NULL;
}

/* find_code_page() is a fixup hook that checks if the module is
 * configured and the input or output potentially need to be translated.
 * If so, context is initialized for the filters.
 */
static int find_code_page(request_rec *r)
{
    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
                                             &charset_lite_module);
    charset_req_t *reqinfo;
    charset_filter_ctx_t *input_ctx, *output_ctx;
    apr_status_t rv;

    ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
                  "uri: %s file: %s method: %d "
                  "imt: %s flags: %s%s%s %s->%s",
                  r->uri,
                  r->filename ? r->filename : "(none)",
                  r->method_number,
                  r->content_type ? r->content_type : "(unknown)",
                  r->main     ? "S" : "",    /* S if subrequest */
                  r->prev     ? "R" : "",    /* R if redirect */
                  r->proxyreq ? "P" : "",    /* P if proxy */
                  dc->charset_source, dc->charset_default);

    /* If we don't have a full directory configuration, bail out.
     */
    if (!dc->charset_source || !dc->charset_default) {
        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01448)
                      "incomplete configuration: src %s, dst %s",
                      dc->charset_source ? dc->charset_source : "unspecified",
                      dc->charset_default ? dc->charset_default : "unspecified");
        return DECLINED;
    }

    /* catch proxy requests */
    if (r->proxyreq) {
        return DECLINED;
    }

    /* mod_rewrite indicators */
    if (r->filename
        && (!strncmp(r->filename, "redirect:", 9)
            || !strncmp(r->filename, "gone:", 5)
            || !strncmp(r->filename, "passthrough:", 12)
            || !strncmp(r->filename, "forbidden:", 10))) {
        return DECLINED;
    }

    /* no translation when server and network charsets are set to the same value */
    if (!strcasecmp(dc->charset_source, dc->charset_default)) {
        return DECLINED;
    }

    /* Get storage for the request data and the output filter context.
     * We rarely need the input filter context, so allocate that separately.
     */
    reqinfo = (charset_req_t *)apr_pcalloc(r->pool,
                                           sizeof(charset_req_t) +
                                           sizeof(charset_filter_ctx_t));
    output_ctx = (charset_filter_ctx_t *)(reqinfo + 1);

    reqinfo->dc = dc;
    output_ctx->dc = dc;
    output_ctx->tmpbb = apr_brigade_create(r->pool,
                                           r->connection->bucket_alloc);
    ap_set_module_config(r->request_config, &charset_lite_module, reqinfo);

    reqinfo->output_ctx = output_ctx;

    switch (r->method_number) {
    case M_PUT:
    case M_POST:
        /* Set up input translation.  Note: A request body can be included
         * with the OPTIONS method, but for now we don't set up translation
         * of it.
         */
        input_ctx = apr_pcalloc(r->pool, sizeof(charset_filter_ctx_t));
        input_ctx->bb = apr_brigade_create(r->pool,
                                           r->connection->bucket_alloc);
        input_ctx->tmp = apr_palloc(r->pool, INPUT_XLATE_BUF_SIZE);
        input_ctx->dc = dc;
        reqinfo->input_ctx = input_ctx;
        rv = apr_xlate_open(&input_ctx->xlate, dc->charset_source,
                            dc->charset_default, r->pool);
        if (rv != APR_SUCCESS) {
            ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01449)
                          "can't open translation %s->%s",
                          dc->charset_default, dc->charset_source);
            return HTTP_INTERNAL_SERVER_ERROR;
        }
        if (apr_xlate_sb_get(input_ctx->xlate, &input_ctx->is_sb) != APR_SUCCESS) {
            input_ctx->is_sb = 0;
        }
    }

    return DECLINED;
}

static int configured_in_list(request_rec *r, const char *filter_name,
                              struct ap_filter_t *filter_list)
{
    struct ap_filter_t *filter = filter_list;

    while (filter) {
        if (!strcasecmp(filter_name, filter->frec->name)) {
            return 1;
        }
        filter = filter->next;
    }
    return 0;
}

static int configured_on_input(request_rec *r, const char *filter_name)
{
    return configured_in_list(r, filter_name, r->input_filters);
}

static int configured_on_output(request_rec *r, const char *filter_name)
{
    return configured_in_list(r, filter_name, r->output_filters);
}

/* xlate_insert_filter() is a filter hook which decides whether or not
 * to insert a translation filter for the current request.
 */
static void xlate_insert_filter(request_rec *r)
{
    /* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
    charset_req_t *reqinfo = ap_get_module_config(r->request_config,
                                                  &charset_lite_module);
    charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
                                             &charset_lite_module);

    if (dc && (dc->implicit_add == IA_NOIMPADD)) {
        ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, r,
                      "xlate output filter not added implicitly because "
                      "CharsetOptions included 'NoImplicitAdd'");
        return;
    }

    if (reqinfo) {
        if (reqinfo->output_ctx && !configured_on_output(r, XLATEOUT_FILTER_NAME)) {
            ap_add_output_filter(XLATEOUT_FILTER_NAME, reqinfo->output_ctx, r,
                                 r->connection);
        }
        ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
                      "xlate output filter not added implicitly because %s",
                      !reqinfo->output_ctx ?
                      "no output configuration available" :
                      "another module added the filter");

        if (reqinfo->input_ctx && !configured_on_input(r, XLATEIN_FILTER_NAME)) {
            ap_add_input_filter(XLATEIN_FILTER_NAME, reqinfo->input_ctx, r,
                                r->connection);
        }
        ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
                      "xlate input filter not added implicitly because %s",
                      !reqinfo->input_ctx ?
                      "no input configuration available" :
                      "another module added the filter");
    }
}

/* stuff that sucks that I know of:
 *
 * bucket handling:
 *  why create an eos bucket when we see it come down the stream?  just send the one
 *  passed as input...  news flash: this will be fixed when xlate_out_filter() starts
 *  using the more generic xlate_brigade()
 *
 * translation mechanics:
 *   we don't handle characters that straddle more than two buckets; an error
 *   will be generated
 */

static apr_status_t send_bucket_downstream(ap_filter_t *f, apr_bucket *b)
{
    charset_filter_ctx_t *ctx = f->ctx;
    apr_status_t rv;

    APR_BRIGADE_INSERT_TAIL(ctx->tmpbb, b);
    rv = ap_pass_brigade(f->next, ctx->tmpbb);
    if (rv != APR_SUCCESS) {
        ctx->ees = EES_DOWNSTREAM;
    }
    apr_brigade_cleanup(ctx->tmpbb);
    return rv;
}

/* send_downstream() is passed the translated data; it puts it in a single-
 * bucket brigade and passes the brigade to the next filter
 */
static apr_status_t send_downstream(ap_filter_t *f, const char *tmp, apr_size_t len)
{
    request_rec *r = f->r;
    conn_rec *c = r->connection;
    apr_bucket *b;

    b = apr_bucket_transient_create(tmp, len, c->bucket_alloc);
    return send_bucket_downstream(f, b);
}

static apr_status_t send_eos(ap_filter_t *f)
{
    request_rec *r = f->r;
    conn_rec *c = r->connection;
    apr_bucket_brigade *bb;
    apr_bucket *b;
    charset_filter_ctx_t *ctx = f->ctx;
    apr_status_t rv;

    bb = apr_brigade_create(r->pool, c->bucket_alloc);
    b = apr_bucket_eos_create(c->bucket_alloc);
    APR_BRIGADE_INSERT_TAIL(bb, b);
    rv = ap_pass_brigade(f->next, bb);
    if (rv != APR_SUCCESS) {
        ctx->ees = EES_DOWNSTREAM;
    }
    return rv;
}

static apr_status_t set_aside_partial_char(charset_filter_ctx_t *ctx,
                                           const char *partial,
                                           apr_size_t partial_len)
{
    apr_status_t rv;

    if (sizeof(ctx->buf) > partial_len) {
        ctx->saved = partial_len;
        memcpy(ctx->buf, partial, partial_len);
        rv = APR_SUCCESS;
    }
    else {
        rv = APR_INCOMPLETE;
        ctx->ees = EES_LIMIT; /* we don't handle chars this wide which straddle
                               * buckets
                               */
    }
    return rv;
}

static apr_status_t finish_partial_char(charset_filter_ctx_t *ctx,
                                        /* input buffer: */
                                        const char **cur_str,
                                        apr_size_t *cur_len,
                                        /* output buffer: */
                                        char **out_str,
                                        apr_size_t *out_len)
{
    apr_status_t rv;
    apr_size_t tmp_input_len;

    /* Keep adding bytes from the input string to the saved string until we
     *    1) finish the input char
     *    2) get an error
     * or 3) run out of bytes to add
     */

    do {
        ctx->buf[ctx->saved] = **cur_str;
        ++ctx->saved;
        ++*cur_str;
        --*cur_len;
        tmp_input_len = ctx->saved;
        rv = apr_xlate_conv_buffer(ctx->xlate,
                                   ctx->buf,
                                   &tmp_input_len,
                                   *out_str,
                                   out_len);
    } while (rv == APR_INCOMPLETE && *cur_len);

    if (rv == APR_SUCCESS) {
        ctx->saved = 0;
    }
    else {
        ctx->ees = EES_LIMIT; /* code isn't smart enough to handle chars
                               * straddling more than two buckets
                               */
    }

    return rv;
}

static void log_xlate_error(ap_filter_t *f, apr_status_t rv)
{
    charset_filter_ctx_t *ctx = f->ctx;
    const char *msg;
    char msgbuf[100];
    apr_size_t len;

    switch(ctx->ees) {
    case EES_LIMIT:
        rv = 0;
        msg = APLOGNO(02193) "xlate filter - a built-in restriction was encountered";
        break;
    case EES_BAD_INPUT:
        rv = 0;
        msg = APLOGNO(02194) "xlate filter - an input character was invalid";
        break;
    case EES_BUCKET_READ:
        rv = 0;
        msg = APLOGNO(02195) "xlate filter - bucket read routine failed";
        break;
    case EES_INCOMPLETE_CHAR:
        rv = 0;
        strcpy(msgbuf, APLOGNO(02196) "xlate filter - incomplete char at end of input - ");
        len = ctx->saved;

        /* We must ensure not to process more than what would fit in the
         * remaining of the destination buffer, including terminating NULL */
        if (len > (sizeof(msgbuf) - strlen(msgbuf) - 1) / 2)
            len = (sizeof(msgbuf) - strlen(msgbuf) - 1) / 2;

        ap_bin2hex(ctx->buf, len, msgbuf + strlen(msgbuf));
        msg = msgbuf;
        break;
    case EES_DOWNSTREAM:
        msg = APLOGNO(02197) "xlate filter - an error occurred in a lower filter";
        break;
    default:
        msg = APLOGNO(02198) "xlate filter - returning error";
    }
    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(02997) "%s", msg);
}

/* chk_filter_chain() is called once per filter instance; it tries to
 * determine if the current filter instance should be disabled because
 * its translation is incompatible with the translation of an existing
 * instance of the translate filter
 *
 * Example bad scenario:
 *
 *   configured filter chain for the request:
 *     INCLUDES XLATEOUT(8859-1->UTS-16)
 *   configured filter chain for the subrequest:
 *     XLATEOUT(8859-1->UTS-16)
 *
 *   When the subrequest is processed, the filter chain will be
 *     XLATEOUT(8859-1->UTS-16) XLATEOUT(8859-1->UTS-16)
 *   This makes no sense, so the instance of XLATEOUT added for the
 *   subrequest will be noop-ed.
 *
 * Example good scenario:
 *
 *   configured filter chain for the request:
 *     INCLUDES XLATEOUT(8859-1->UTS-16)
 *   configured filter chain for the subrequest:
 *     XLATEOUT(IBM-1047->8859-1)
 *
 *   When the subrequest is processed, the filter chain will be
 *     XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTS-16)
 *   This makes sense, so the instance of XLATEOUT added for the
 *   subrequest will be left alone and it will translate from
 *   IBM-1047->8859-1.
 */
static void chk_filter_chain(ap_filter_t *f)
{
    ap_filter_t *curf;
    charset_filter_ctx_t *curctx, *last_xlate_ctx = NULL,
        *ctx = f->ctx;
    int output = !strcasecmp(f->frec->name, XLATEOUT_FILTER_NAME);

    if (ctx->noop) {
        return;
    }

    /* walk the filter chain; see if it makes sense for our filter to
     * do any translation
     */
    curf = output ? f->r->output_filters : f->r->input_filters;
    while (curf) {
        if (!strcasecmp(curf->frec->name, f->frec->name) &&
            curf->ctx) {
            curctx = (charset_filter_ctx_t *)curf->ctx;
            if (!last_xlate_ctx) {
                last_xlate_ctx = curctx;
            }
            else {
                if (strcmp(last_xlate_ctx->dc->charset_default,
                           curctx->dc->charset_source)) {
                    /* incompatible translation
                     * if our filter instance is incompatible with an instance
                     * already in place, noop our instance
                     * Notes:
                     * . We are only willing to noop our own instance.
                     * . It is possible to noop another instance which has not
                     *   yet run, but this is not currently implemented.
                     *   Hopefully it will not be needed.
                     * . It is not possible to noop an instance which has
                     *   already run.
                     */
                    if (last_xlate_ctx == f->ctx) {
                        last_xlate_ctx->noop = 1;
                        if (APLOGrtrace1(f->r)) {
                            const char *symbol = output ? "->" : "<-";

                            ap_log_rerror(APLOG_MARK, APLOG_DEBUG,
                                          0, f->r, APLOGNO(01451)
                                          "%s %s - disabling "
                                          "translation %s%s%s; existing "
                                          "translation %s%s%s",
                                          f->r->uri ? "uri" : "file",
                                          f->r->uri ? f->r->uri : f->r->filename,
                                          last_xlate_ctx->dc->charset_source,
                                          symbol,
                                          last_xlate_ctx->dc->charset_default,
                                          curctx->dc->charset_source,
                                          symbol,
                                          curctx->dc->charset_default);
                        }
                    }
                    else {
                        const char *symbol = output ? "->" : "<-";

                        ap_log_rerror(APLOG_MARK, APLOG_ERR,
                                      0, f->r, APLOGNO(01452)
                                      "chk_filter_chain() - can't disable "
                                      "translation %s%s%s; existing "
                                      "translation %s%s%s",
                                      last_xlate_ctx->dc->charset_source,
                                      symbol,
                                      last_xlate_ctx->dc->charset_default,
                                      curctx->dc->charset_source,
                                      symbol,
                                      curctx->dc->charset_default);
                    }
                    break;
                }
            }
        }
        curf = curf->next;
    }
}

/* xlate_brigade() is used to filter request and response bodies
 *
 * we'll stop when one of the following occurs:
 * . we run out of buckets
 * . we run out of space in the output buffer
 * . we hit an error or metadata
 *
 * inputs:
 *   bb:               brigade to process
 *   buffer:           storage to hold the translated characters
 *   buffer_avail:     size of buffer
 *   (and a few more uninteresting parms)
 *
 * outputs:
 *   return value:     APR_SUCCESS or some error code
 *   bb:               we've removed any buckets representing the
 *                     translated characters; the eos bucket, if
 *                     present, will be left in the brigade
 *   buffer:           filled in with translated characters
 *   buffer_avail:     updated with the bytes remaining
 *   hit_eos:          did we hit an EOS bucket?
 */
static apr_status_t xlate_brigade(charset_filter_ctx_t *ctx,
                                  apr_bucket_brigade *bb,
                                  char *buffer,
                                  apr_size_t *buffer_avail,
                                  int *hit_eos)
{
    apr_bucket *b = NULL; /* set to NULL only to quiet some gcc */
    apr_bucket *consumed_bucket;
    const char *bucket;
    apr_size_t bytes_in_bucket; /* total bytes read from current bucket */
    apr_size_t bucket_avail;    /* bytes left in current bucket */
    apr_status_t rv = APR_SUCCESS;

    *hit_eos = 0;
    bucket_avail = 0;
    consumed_bucket = NULL;
    while (1) {
        if (!bucket_avail) { /* no bytes left to process in the current bucket... */
            if (consumed_bucket) {
                apr_bucket_delete(consumed_bucket);
                consumed_bucket = NULL;
            }
            b = APR_BRIGADE_FIRST(bb);
            if (b == APR_BRIGADE_SENTINEL(bb) ||
                APR_BUCKET_IS_METADATA(b)) {
                break;
            }
            rv = apr_bucket_read(b, &bucket, &bytes_in_bucket, APR_BLOCK_READ);
            if (rv != APR_SUCCESS) {
                ctx->ees = EES_BUCKET_READ;
                break;
            }
            bucket_avail = bytes_in_bucket;
            consumed_bucket = b;   /* for axing when we're done reading it */
        }
        if (bucket_avail) {
            /* We've got data, so translate it. */
            if (ctx->saved) {
                /* Rats... we need to finish a partial character from the previous
                 * bucket.
                 *
                 * Strangely, finish_partial_char() increments the input buffer
                 * pointer but does not increment the output buffer pointer.
                 */
                apr_size_t old_buffer_avail = *buffer_avail;
                rv = finish_partial_char(ctx,
                                         &bucket, &bucket_avail,
                                         &buffer, buffer_avail);
                buffer += old_buffer_avail - *buffer_avail;
            }
            else {
                apr_size_t old_buffer_avail = *buffer_avail;
                apr_size_t old_bucket_avail = bucket_avail;
                rv = apr_xlate_conv_buffer(ctx->xlate,
                                           bucket, &bucket_avail,
                                           buffer,
                                           buffer_avail);
                buffer  += old_buffer_avail - *buffer_avail;
                bucket  += old_bucket_avail - bucket_avail;

                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
                    /* We need to save the final byte(s) for next time; we can't
                     * convert it until we look at the next bucket.
                     */
                    rv = set_aside_partial_char(ctx, bucket, bucket_avail);
                    bucket_avail = 0;
                }
            }
            if (rv != APR_SUCCESS) {
                /* bad input byte or partial char too big to store */
                break;
            }
            if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
                /* if any data remains in the current bucket, split there */
                if (bucket_avail) {
                    apr_bucket_split(b, bytes_in_bucket - bucket_avail);
                }
                apr_bucket_delete(b);
                break;
            }
        }
    }

    if (!APR_BRIGADE_EMPTY(bb)) {
        b = APR_BRIGADE_FIRST(bb);
        if (APR_BUCKET_IS_EOS(b)) {
            /* Leave the eos bucket in the brigade for reporting to
             * subsequent filters.
             */
            *hit_eos = 1;
            if (ctx->saved) {
                /* Oops... we have a partial char from the previous bucket
                 * that won't be completed because there's no more data.
                 */
                rv = APR_INCOMPLETE;
                ctx->ees = EES_INCOMPLETE_CHAR;
            }
        }
    }

    return rv;
}

/* xlate_out_filter() handles (almost) arbitrary conversions from one charset
 * to another...
 * translation is determined in the fixup hook (find_code_page), which is
 * where the filter's context data is set up... the context data gives us
 * the translation handle
 */
static apr_status_t xlate_out_filter(ap_filter_t *f, apr_bucket_brigade *bb)
{
    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
                                                  &charset_lite_module);
    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
                                             &charset_lite_module);
    charset_filter_ctx_t *ctx = f->ctx;
    apr_bucket *dptr, *consumed_bucket;
    const char *cur_str;
    apr_size_t cur_len, cur_avail;
    char tmp[OUTPUT_XLATE_BUF_SIZE];
    apr_size_t space_avail;
    int done;
    apr_status_t rv = APR_SUCCESS;

    if (!ctx) {
        /* this is SetOutputFilter path; grab the preallocated context,
         * if any; note that if we decided not to do anything in an earlier
         * handler, we won't even have a reqinfo
         */
        if (reqinfo) {
            ctx = f->ctx = reqinfo->output_ctx;
            reqinfo->output_ctx = NULL; /* prevent SNAFU if user coded us twice
                                         * in the filter chain; we can't have two
                                         * instances using the same context
                                         */
        }
        if (!ctx) {                   /* no idea how to translate; don't do anything */
            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
            ctx->dc = dc;
            ctx->noop = 1;
        }
    }

    /* Check the mime type to see if translation should be performed.
     */
    if (!ctx->noop && ctx->xlate == NULL) {
        const char *mime_type = f->r->content_type;

        if (mime_type && (ap_cstr_casecmpn(mime_type, "text/", 5) == 0 ||
#if APR_CHARSET_EBCDIC
        /* On an EBCDIC machine, be willing to translate mod_autoindex-
         * generated output.  Otherwise, it doesn't look too cool.
         *
         * XXX This isn't a perfect fix because this doesn't trigger us
         * to convert from the charset of the source code to ASCII.  The
         * general solution seems to be to allow a generator to set an
         * indicator in the r specifying that the body is coded in the
         * implementation character set (i.e., the charset of the source
         * code).  This would get several different types of documents
         * translated properly: mod_autoindex output, mod_status output,
         * mod_info output, hard-coded error documents, etc.
         */
            strcmp(mime_type, DIR_MAGIC_TYPE) == 0 ||
#endif
            ap_cstr_casecmpn(mime_type, "message/", 8) == 0 ||
            dc->force_xlate == FX_FORCE)) {

            rv = apr_xlate_open(&ctx->xlate,
                                dc->charset_default, dc->charset_source, f->r->pool);
            if (rv != APR_SUCCESS) {
                ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01453)
                              "can't open translation %s->%s",
                              dc->charset_source, dc->charset_default);
                ctx->noop = 1;
            }
            else {
                if (apr_xlate_sb_get(ctx->xlate, &ctx->is_sb) != APR_SUCCESS) {
                    ctx->is_sb = 0;
                }
            }
        }
        else {
            ctx->noop = 1;
            if (mime_type) {
                ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
                              "mime type is %s; no translation selected",
                              mime_type);
            }
        }
    }

    ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
                  "xlate_out_filter() - "
                  "charset_source: %s charset_default: %s",
                  dc && dc->charset_source ? dc->charset_source : "(none)",
                  dc && dc->charset_default ? dc->charset_default : "(none)");

    if (!ctx->ran) {  /* filter never ran before */
        chk_filter_chain(f);
        ctx->ran = 1;
        if (!ctx->noop && !ctx->is_sb) {
            /* We're not converting between two single-byte charsets, so unset
             * Content-Length since it is unlikely to remain the same.
             */
            apr_table_unset(f->r->headers_out, "Content-Length");
        }
    }

    if (ctx->noop) {
        return ap_pass_brigade(f->next, bb);
    }

    dptr = APR_BRIGADE_FIRST(bb);
    done = 0;
    cur_len = 0;
    space_avail = sizeof(tmp);
    consumed_bucket = NULL;
    while (!done) {
        if (!cur_len) { /* no bytes left to process in the current bucket... */
            if (consumed_bucket) {
                apr_bucket_delete(consumed_bucket);
                consumed_bucket = NULL;
            }
            if (dptr == APR_BRIGADE_SENTINEL(bb)) {
                break;
            }
            if (APR_BUCKET_IS_EOS(dptr)) {
                cur_len = -1; /* XXX yuck, but that tells us to send
                                 * eos down; when we minimize our bb construction
                                 * we'll fix this crap */
                if (ctx->saved) {
                    /* Oops... we have a partial char from the previous bucket
                     * that won't be completed because there's no more data.
                     */
                    rv = APR_INCOMPLETE;
                    ctx->ees = EES_INCOMPLETE_CHAR;
                }
                break;
            }
            if (APR_BUCKET_IS_METADATA(dptr)) {
                apr_bucket *metadata_bucket;
                metadata_bucket = dptr;
                dptr = APR_BUCKET_NEXT(dptr);
                APR_BUCKET_REMOVE(metadata_bucket);
                rv = send_bucket_downstream(f, metadata_bucket);
                if (rv != APR_SUCCESS) {
                    done = 1;
                }
                continue;
            }
            rv = apr_bucket_read(dptr, &cur_str, &cur_len, APR_BLOCK_READ);
            if (rv != APR_SUCCESS) {
                ctx->ees = EES_BUCKET_READ;
                break;
            }
            consumed_bucket = dptr; /* for axing when we're done reading it */
            dptr = APR_BUCKET_NEXT(dptr); /* get ready for when we access the
                                          * next bucket */
        }
        /* Try to fill up our tmp buffer with translated data. */
        cur_avail = cur_len;

        if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
            if (ctx->saved) {
                /* Rats... we need to finish a partial character from the previous
                 * bucket.
                 */
                char *tmp_tmp;

                tmp_tmp = tmp + sizeof(tmp) - space_avail;
                rv = finish_partial_char(ctx,
                                         &cur_str, &cur_len,
                                         &tmp_tmp, &space_avail);
            }
            else {
                rv = apr_xlate_conv_buffer(ctx->xlate,
                                           cur_str, &cur_avail,
                                           tmp + sizeof(tmp) - space_avail, &space_avail);

                /* Update input ptr and len after consuming some bytes */
                cur_str += cur_len - cur_avail;
                cur_len = cur_avail;

                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
                    /* We need to save the final byte(s) for next time; we can't
                     * convert it until we look at the next bucket.
                     */
                    rv = set_aside_partial_char(ctx, cur_str, cur_len);
                    cur_len = 0;
                }
            }
        }

        if (rv != APR_SUCCESS) {
            /* bad input byte or partial char too big to store */
            done = 1;
        }

        if (space_avail < XLATE_MIN_BUFF_LEFT) {
            /* It is time to flush, as there is not enough space left in the
             * current output buffer to bother with converting more data.
             */
            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
            if (rv != APR_SUCCESS) {
                done = 1;
            }

            /* tmp is now empty */
            space_avail = sizeof(tmp);
        }
    }

    if (rv == APR_SUCCESS) {
        if (space_avail < sizeof(tmp)) { /* gotta write out what we converted */
            rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
        }
    }
    if (rv == APR_SUCCESS) {
        if (cur_len == -1) {
            rv = send_eos(f);
        }
    }
    else {
        log_xlate_error(f, rv);
    }

    return rv;
}

static apr_status_t xlate_in_filter(ap_filter_t *f, apr_bucket_brigade *bb,
                                    ap_input_mode_t mode, apr_read_type_e block,
                                    apr_off_t readbytes)
{
    apr_status_t rv;
    charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
                                                  &charset_lite_module);
    charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
                                             &charset_lite_module);
    charset_filter_ctx_t *ctx = f->ctx;
    apr_size_t buffer_size;
    int hit_eos;

    /* just get out of the way of things we don't want. */
    if (mode != AP_MODE_READBYTES) {
        return ap_get_brigade(f->next, bb, mode, block, readbytes);
    }

    if (!ctx) {
        /* this is SetInputFilter path; grab the preallocated context,
         * if any; note that if we decided not to do anything in an earlier
         * handler, we won't even have a reqinfo
         */
        if (reqinfo) {
            ctx = f->ctx = reqinfo->input_ctx;
            reqinfo->input_ctx = NULL; /* prevent SNAFU if user coded us twice
                                        * in the filter chain; we can't have two
                                        * instances using the same context
                                        */
        }
        if (!ctx) {                   /* no idea how to translate; don't do anything */
            ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
            ctx->dc = dc;
            ctx->noop = 1;
        }
    }

    ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
                 "xlate_in_filter() - "
                 "charset_source: %s charset_default: %s",
                 dc && dc->charset_source ? dc->charset_source : "(none)",
                 dc && dc->charset_default ? dc->charset_default : "(none)");

    if (!ctx->ran) {  /* filter never ran before */
        chk_filter_chain(f);
        ctx->ran = 1;
        if (!ctx->noop && !ctx->is_sb
            && apr_table_get(f->r->headers_in, "Content-Length")) {
            /* A Content-Length header is present, but it won't be valid after
             * conversion because we're not converting between two single-byte
             * charsets.  This will affect most CGI scripts and may affect
             * some modules.
             * Content-Length can't be unset here because that would break
             * being able to read the request body.
             * Processing of chunked request bodies is not impacted by this
             * filter since the length was not declared anyway.
             */
            ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r,
                          "Request body length may change, resulting in "
                          "misprocessing by some modules or scripts");
        }
    }

    if (ctx->noop) {
        return ap_get_brigade(f->next, bb, mode, block, readbytes);
    }

    if (APR_BRIGADE_EMPTY(ctx->bb)) {
        if ((rv = ap_get_brigade(f->next, bb, mode, block,
                                 readbytes)) != APR_SUCCESS) {
            return rv;
        }
    }
    else {
        APR_BRIGADE_PREPEND(bb, ctx->bb); /* first use the leftovers */
    }

    buffer_size = INPUT_XLATE_BUF_SIZE;
    rv = xlate_brigade(ctx, bb, ctx->tmp, &buffer_size, &hit_eos);
    if (rv == APR_SUCCESS) {
        if (!hit_eos) {
            /* move anything leftover into our context for next time;
             * we don't currently "set aside" since the data came from
             * down below, but I suspect that for long-term we need to
             * do that
             */
            APR_BRIGADE_CONCAT(ctx->bb, bb);
        }
        if (buffer_size < INPUT_XLATE_BUF_SIZE) { /* do we have output? */
            apr_bucket *e;

            e = apr_bucket_heap_create(ctx->tmp,
                                       INPUT_XLATE_BUF_SIZE - buffer_size,
                                       NULL, f->r->connection->bucket_alloc);
            /* make sure we insert at the head, because there may be
             * an eos bucket already there, and the eos bucket should
             * come after the data
             */
            APR_BRIGADE_INSERT_HEAD(bb, e);
        }
        else {
            /* XXX need to get some more data... what if the last brigade
             * we got had only the first byte of a multibyte char?  we need
             * to grab more data from the network instead of returning an
             * empty brigade
             */
        }
        /* If we have any metadata at the head of ctx->bb, go ahead and move it
         * onto the end of bb to be returned to our caller.
         */
        if (!APR_BRIGADE_EMPTY(ctx->bb)) {
            apr_bucket *b = APR_BRIGADE_FIRST(ctx->bb);
            while (b != APR_BRIGADE_SENTINEL(ctx->bb)
                   && APR_BUCKET_IS_METADATA(b)) {
                APR_BUCKET_REMOVE(b);
                APR_BRIGADE_INSERT_TAIL(bb, b);
                b = APR_BRIGADE_FIRST(ctx->bb);
            }
        }
    }
    else {
        log_xlate_error(f, rv);
    }

    return rv;
}

static const command_rec cmds[] =
{
    AP_INIT_TAKE1("CharsetSourceEnc",
                  add_charset_source,
                  NULL,
                  OR_FILEINFO,
                  "source (html,cgi,ssi) file charset"),
    AP_INIT_TAKE1("CharsetDefault",
                  add_charset_default,
                  NULL,
                  OR_FILEINFO,
                  "name of default charset"),
    AP_INIT_ITERATE("CharsetOptions",
                    add_charset_options,
                    NULL,
                    OR_FILEINFO,
                    "valid options: ImplicitAdd, NoImplicitAdd, TranslateAllMimeTypes, "
                    "NoTranslateAllMimeTypes"),
    {NULL}
};

static void charset_register_hooks(apr_pool_t *p)
{
    ap_hook_fixups(find_code_page, NULL, NULL, APR_HOOK_MIDDLE);
    ap_hook_insert_filter(xlate_insert_filter, NULL, NULL, APR_HOOK_REALLY_LAST);
    ap_register_output_filter(XLATEOUT_FILTER_NAME, xlate_out_filter, NULL,
                              AP_FTYPE_RESOURCE);
    ap_register_input_filter(XLATEIN_FILTER_NAME, xlate_in_filter, NULL,
                             AP_FTYPE_RESOURCE);
}

AP_DECLARE_MODULE(charset_lite) =
{
    STANDARD20_MODULE_STUFF,
    create_charset_dir_conf,
    merge_charset_dir_conf,
    NULL,
    NULL,
    cmds,
    charset_register_hooks
};