comparison src/http/modules/ngx_http_charset_filter_module.c @ 206:3866d57d9cfd NGINX_0_3_50

nginx 0.3.50 *) Change: the "proxy_redirect_errors" and "fastcgi_redirect_errors" directives were renamed to the "proxy_intercept_errors" and "fastcgi_intercept_errors" directives. *) Feature: the ngx_http_charset_module supports recoding from single-byte encodings to the UTF-8 encoding and back. *) Feature: the "X-Accel-Charset" response header line is supported in proxy and FastCGI mode. *) Bugfix: the "\" escape symbol in the "\"" and "\'" pairs in the SSI command was removed only if the command also had the "$" symbol. *) Bugfix: the "<!--" string might be added under some conditions in the SSI after inclusion. *) Bugfix: if the "Content-Length: 0" header line was in the response, then in nonbuffered proxying mode the client connection was not closed.
author Igor Sysoev <http://sysoev.ru>
date Wed, 28 Jun 2006 00:00:00 +0400
parents ca5f86d94316
children b12b3b1a9426
comparison
equal deleted inserted replaced
205:e53bd15c244a 206:3866d57d9cfd
7 #include <ngx_config.h> 7 #include <ngx_config.h>
8 #include <ngx_core.h> 8 #include <ngx_core.h>
9 #include <ngx_http.h> 9 #include <ngx_http.h>
10 10
11 11
12 #define NGX_HTTP_NO_CHARSET -2 12 #define NGX_HTTP_NO_CHARSET -2
13
14 /* 1 byte length and up to 3 bytes for the UTF-8 encoding of the UCS-2 */
15 #define NGX_UTF_LEN 4
16
17 #define NGX_HTML_ENTITY_LEN (sizeof("&#1114111;") - 1)
13 18
14 19
15 typedef struct { 20 typedef struct {
16 u_char **tables; 21 u_char **tables;
17 ngx_str_t name; 22 ngx_str_t name;
18 23
19 ngx_uint_t utf8; /* unsigned utf8:1; */ 24 unsigned length:16;
25 unsigned utf8:1;
20 } ngx_http_charset_t; 26 } ngx_http_charset_t;
21 27
22 28
23 typedef struct { 29 typedef struct {
24 ngx_int_t src; 30 ngx_int_t src;
25 ngx_int_t dst; 31 ngx_int_t dst;
26 } ngx_http_charset_recode_t; 32 } ngx_http_charset_recode_t;
27 33
28 34
29 typedef struct { 35 typedef struct {
30 ngx_int_t src; 36 ngx_int_t src;
31 ngx_int_t dst; 37 ngx_int_t dst;
32 u_char *src2dst; 38 u_char *src2dst;
33 u_char *dst2src; 39 u_char *dst2src;
34 } ngx_http_charset_tables_t; 40 } ngx_http_charset_tables_t;
35 41
36 42
37 typedef struct { 43 typedef struct {
38 ngx_array_t charsets; /* ngx_http_charset_t */ 44 ngx_array_t charsets; /* ngx_http_charset_t */
39 ngx_array_t tables; /* ngx_http_charset_tables_t */ 45 ngx_array_t tables; /* ngx_http_charset_tables_t */
40 ngx_array_t recodes; /* ngx_http_charset_recode_t */ 46 ngx_array_t recodes; /* ngx_http_charset_recode_t */
41 } ngx_http_charset_main_conf_t; 47 } ngx_http_charset_main_conf_t;
42 48
43 49
44 typedef struct { 50 typedef struct {
45 ngx_int_t charset; 51 ngx_int_t charset;
46 ngx_int_t source_charset; 52 ngx_int_t source_charset;
47 ngx_flag_t override_charset; 53 ngx_flag_t override_charset;
48 } ngx_http_charset_loc_conf_t; 54 } ngx_http_charset_loc_conf_t;
49 55
50 56
51 typedef struct { 57 typedef struct {
52 u_char *table; 58 u_char *table;
53 ngx_int_t charset; 59 ngx_int_t charset;
60
61 ngx_chain_t *busy;
62 ngx_chain_t *free_bufs;
63 ngx_chain_t *free_buffers;
64
65 size_t saved_len;
66 u_char saved[NGX_UTF_LEN];
67
68 unsigned length:16;
69 unsigned from_utf8:1;
70 unsigned to_utf8:1;
54 } ngx_http_charset_ctx_t; 71 } ngx_http_charset_ctx_t;
55 72
56 73
74 typedef struct {
75 ngx_http_charset_tables_t *table;
76 ngx_http_charset_t *charset;
77 ngx_uint_t characters;
78 } ngx_http_charset_conf_ctx_t;
79
80
81 static ngx_int_t ngx_http_charset_get_charset(ngx_http_charset_t *charsets,
82 ngx_uint_t n, u_char *charset);
83 static ngx_int_t ngx_http_charset_set_charset(ngx_http_request_t *r,
84 ngx_http_charset_t *charsets, ngx_int_t charset, ngx_int_t source_charset);
57 static ngx_uint_t ngx_http_charset_recode(ngx_buf_t *b, u_char *table); 85 static ngx_uint_t ngx_http_charset_recode(ngx_buf_t *b, u_char *table);
58 86 static ngx_chain_t *ngx_http_charset_recode_from_utf8(ngx_pool_t *pool,
59 static char *ngx_charset_map_block(ngx_conf_t *cf, ngx_command_t *cmd, 87 ngx_buf_t *buf, ngx_http_charset_ctx_t *ctx);
88 static ngx_chain_t *ngx_http_charset_recode_to_utf8(ngx_pool_t *pool,
89 ngx_buf_t *buf, ngx_http_charset_ctx_t *ctx);
90
91 static ngx_chain_t *ngx_http_charset_get_buf(ngx_pool_t *pool,
92 ngx_http_charset_ctx_t *ctx);
93 static ngx_chain_t *ngx_http_charset_get_buffer(ngx_pool_t *pool,
94 ngx_http_charset_ctx_t *ctx, size_t size);
95
96 static char *ngx_http_charset_map_block(ngx_conf_t *cf, ngx_command_t *cmd,
60 void *conf); 97 void *conf);
61 static char *ngx_charset_map(ngx_conf_t *cf, ngx_command_t *dummy, void *conf); 98 static char *ngx_http_charset_map(ngx_conf_t *cf, ngx_command_t *dummy,
99 void *conf);
62 100
63 static char *ngx_http_set_charset_slot(ngx_conf_t *cf, ngx_command_t *cmd, 101 static char *ngx_http_set_charset_slot(ngx_conf_t *cf, ngx_command_t *cmd,
64 void *conf); 102 void *conf);
65 static ngx_int_t ngx_http_add_charset(ngx_array_t *charsets, ngx_str_t *name); 103 static ngx_int_t ngx_http_add_charset(ngx_array_t *charsets, ngx_str_t *name);
66 104
99 offsetof(ngx_http_charset_loc_conf_t, override_charset), 137 offsetof(ngx_http_charset_loc_conf_t, override_charset),
100 NULL }, 138 NULL },
101 139
102 { ngx_string("charset_map"), 140 { ngx_string("charset_map"),
103 NGX_HTTP_MAIN_CONF|NGX_CONF_BLOCK|NGX_CONF_TAKE2, 141 NGX_HTTP_MAIN_CONF|NGX_CONF_BLOCK|NGX_CONF_TAKE2,
104 ngx_charset_map_block, 142 ngx_http_charset_map_block,
105 NGX_HTTP_MAIN_CONF_OFFSET, 143 NGX_HTTP_MAIN_CONF_OFFSET,
106 0, 144 0,
107 NULL }, 145 NULL },
108 146
109 ngx_null_command 147 ngx_null_command
146 184
147 185
148 static ngx_int_t 186 static ngx_int_t
149 ngx_http_charset_header_filter(ngx_http_request_t *r) 187 ngx_http_charset_header_filter(ngx_http_request_t *r)
150 { 188 {
151 size_t len; 189 u_char *ct;
152 u_char *p;
153 ngx_int_t charset, source_charset; 190 ngx_int_t charset, source_charset;
154 ngx_uint_t i; 191 ngx_str_t *mc;
192 ngx_uint_t n;
155 ngx_http_charset_t *charsets; 193 ngx_http_charset_t *charsets;
156 ngx_http_charset_ctx_t *ctx; 194 ngx_http_charset_ctx_t *ctx;
157 ngx_http_charset_loc_conf_t *lcf, *mlcf; 195 ngx_http_charset_loc_conf_t *lcf, *mlcf;
158 ngx_http_charset_main_conf_t *mcf; 196 ngx_http_charset_main_conf_t *mcf;
159 197
160 mcf = ngx_http_get_module_main_conf(r, ngx_http_charset_filter_module); 198 mcf = ngx_http_get_module_main_conf(r, ngx_http_charset_filter_module);
161 199
162 ctx = ngx_http_get_module_ctx(r->main, ngx_http_charset_filter_module); 200 charsets = mcf->charsets.elts;
163 201 n = mcf->charsets.nelts;
164 if (ctx == NULL) { 202
165 mlcf = ngx_http_get_module_loc_conf(r->main, 203 /* destination charset */
166 ngx_http_charset_filter_module); 204
167 charset = mlcf->charset; 205 if (r == r->main) {
206
207 if (r->headers_out.content_type.len == 0) {
208 return ngx_http_next_header_filter(r);
209 }
210
211 if (r->headers_out.override_charset
212 && r->headers_out.override_charset->len)
213 {
214 charset = ngx_http_charset_get_charset(charsets, n,
215 r->headers_out.override_charset->data);
216
217 if (charset == NGX_HTTP_NO_CHARSET) {
218 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
219 "unknown charset \"%V\" to override",
220 &r->headers_out.override_charset);
221
222 return ngx_http_next_header_filter(r);
223 }
224
225 } else {
226 mlcf = ngx_http_get_module_loc_conf(r,
227 ngx_http_charset_filter_module);
228 charset = mlcf->charset;
229
230 if (charset == NGX_HTTP_NO_CHARSET) {
231 return ngx_http_next_header_filter(r);
232 }
233
234 if (r->headers_out.charset.len) {
235 if (mlcf->override_charset == 0) {
236 return ngx_http_next_header_filter(r);
237 }
238
239 } else {
240 ct = r->headers_out.content_type.data;
241
242 if (ngx_strncasecmp(ct, "text/", 5) != 0
243 && ngx_strncasecmp(ct, "application/x-javascript", 24) != 0)
244 {
245 return ngx_http_next_header_filter(r);
246 }
247 }
248 }
249
250 } else {
251 ctx = ngx_http_get_module_ctx(r->main, ngx_http_charset_filter_module);
252
253 if (ctx == NULL) {
254
255 mc = &r->main->headers_out.charset;
256
257 if (mc->len == 0) {
258 return ngx_http_next_header_filter(r);
259 }
260
261 ctx = ngx_pcalloc(r->pool, sizeof(ngx_http_charset_ctx_t));
262 if (ctx == NULL) {
263 return NGX_ERROR;
264 }
265
266 ngx_http_set_ctx(r->main, ctx, ngx_http_charset_filter_module);
267
268 charset = ngx_http_charset_get_charset(charsets, n, mc->data);
269
270 ctx->charset = charset;
271
272 if (charset == NGX_HTTP_NO_CHARSET) {
273 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
274 "unknown charset \"%V\" of main request", mc);
275
276 return ngx_http_next_header_filter(r);
277 }
278 }
279
280 charset = ctx->charset;
168 281
169 if (charset == NGX_HTTP_NO_CHARSET) { 282 if (charset == NGX_HTTP_NO_CHARSET) {
170 return ngx_http_next_header_filter(r); 283 return ngx_http_next_header_filter(r);
171 } 284 }
172 285 }
173 } else { 286
174 charset = ctx->charset; 287 /* source charset */
175 } 288
176 289 if (r->headers_out.charset.len == 0) {
177 charsets = mcf->charsets.elts; 290 lcf = ngx_http_get_module_loc_conf(r, ngx_http_charset_filter_module);
178 291
179 if (r == r->main) { 292 return ngx_http_charset_set_charset(r, mcf->charsets.elts, charset,
180 if (r->headers_out.content_type.len == 0) { 293 lcf->source_charset);
181 return ngx_http_next_header_filter(r); 294 }
182 } 295
183 296 source_charset = ngx_http_charset_get_charset(charsets, n,
184 if (ngx_strncasecmp(r->headers_out.content_type.data, "text/", 5) != 0 297 r->headers_out.charset.data);
185 && ngx_strncasecmp(r->headers_out.content_type.data, 298
186 "application/x-javascript", 24) != 0) 299 if (source_charset == NGX_HTTP_NO_CHARSET) {
187 { 300 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
188 return ngx_http_next_header_filter(r); 301 "unknown source charset \"%V\"", &r->headers_out.charset);
189 }
190
191 } else {
192 if (r->headers_out.content_type.len == 0) {
193 mlcf = ngx_http_get_module_loc_conf(r->main,
194 ngx_http_charset_filter_module);
195 source_charset = mlcf->source_charset;
196
197 goto found;
198 }
199 }
200
201 lcf = ngx_http_get_module_loc_conf(r, ngx_http_charset_filter_module);
202
203 len = 0;
204
205 for (p = r->headers_out.content_type.data; *p; p++) {
206 if (*p == ';') {
207 len = p - r->headers_out.content_type.data;
208 }
209
210 if (ngx_strncasecmp(p, "charset=", 8) != 0) {
211 continue;
212 }
213
214 p += 8;
215
216 for (i = 0; i < mcf->charsets.nelts; i++) {
217
218 if (ngx_strcasecmp(p, charsets[i].name.data) == 0) {
219
220 if (r == r->main && lcf->override_charset == 0) {
221 ctx = ngx_pcalloc(r->pool, sizeof(ngx_http_charset_ctx_t));
222 if (ctx == NULL) {
223 return NGX_ERROR;
224 }
225
226 ngx_http_set_ctx(r, ctx, ngx_http_charset_filter_module);
227
228 ctx->charset = i;
229
230 return ngx_http_next_header_filter(r);
231 }
232
233 if (i != (ngx_uint_t) charset
234 && (charsets[i].tables == NULL
235 || charsets[i].tables[charset] == NULL))
236 {
237 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
238 "no \"charset_map\" between the charsets "
239 "\"%V\" and \"%V\"",
240 &charsets[i].name, &charsets[charset].name);
241
242 return ngx_http_next_header_filter(r);
243 }
244
245 r->headers_out.content_type.len = len;
246
247 if (r->headers_out.status == NGX_HTTP_MOVED_PERMANENTLY
248 || r->headers_out.status == NGX_HTTP_MOVED_TEMPORARILY)
249 {
250 /*
251 * do not set charset for the redirect because NN 4.x
252 * uses this charset instead of the next page charset
253 */
254
255 r->headers_out.charset.len = 0;
256 return ngx_http_next_header_filter(r);
257 }
258
259 source_charset = i;
260
261 goto found;
262 }
263 }
264 302
265 return ngx_http_next_header_filter(r); 303 return ngx_http_next_header_filter(r);
266 } 304 }
305
306 if (source_charset != charset
307 && (charsets[source_charset].tables == NULL
308 || charsets[source_charset].tables[charset] == NULL))
309 {
310 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
311 "no \"charset_map\" between the charsets "
312 "\"%V\" and \"%V\"",
313 &charsets[source_charset].name, &charsets[charset].name);
314
315 return ngx_http_next_header_filter(r);
316 }
317
318 r->headers_out.content_type.len = r->headers_out.content_type_len;
319
320 return ngx_http_charset_set_charset(r, mcf->charsets.elts, charset,
321 source_charset);
322 }
323
324
325 static ngx_int_t
326 ngx_http_charset_get_charset(ngx_http_charset_t *charsets, ngx_uint_t n,
327 u_char *charset)
328 {
329 ngx_uint_t i;
330
331 for (i = 0; i < n; i++) {
332 if (ngx_strcasecmp(charsets[i].name.data, charset) == 0) {
333 return i;
334 }
335 }
336
337 return NGX_HTTP_NO_CHARSET;
338 }
339
340
341 static ngx_int_t
342 ngx_http_charset_set_charset(ngx_http_request_t *r,
343 ngx_http_charset_t *charsets, ngx_int_t charset, ngx_int_t source_charset)
344 {
345 ngx_http_charset_ctx_t *ctx;
267 346
268 if (r->headers_out.status == NGX_HTTP_MOVED_PERMANENTLY 347 if (r->headers_out.status == NGX_HTTP_MOVED_PERMANENTLY
269 || r->headers_out.status == NGX_HTTP_MOVED_TEMPORARILY) 348 || r->headers_out.status == NGX_HTTP_MOVED_TEMPORARILY)
270 { 349 {
271 /* 350 /*
272 * do not set charset for the redirect because NN 4.x 351 * do not set charset for the redirect because NN 4.x
273 * use this charset instead of the next page charset 352 * use this charset instead of the next page charset
274 */ 353 */
275 354
276 r->headers_out.charset.len = 0; 355 r->headers_out.charset.len = 0;
356
277 return ngx_http_next_header_filter(r); 357 return ngx_http_next_header_filter(r);
278 } 358 }
279
280 if (r->headers_out.charset.len) {
281 return ngx_http_next_header_filter(r);
282 }
283
284 source_charset = lcf->source_charset;
285
286 found:
287 359
288 r->headers_out.charset = charsets[charset].name; 360 r->headers_out.charset = charsets[charset].name;
289 r->utf8 = charsets[charset].utf8; 361 r->utf8 = charsets[charset].utf8;
290 362
291 if (source_charset == NGX_CONF_UNSET || source_charset == charset) { 363 if (source_charset == NGX_CONF_UNSET || source_charset == charset) {
299 371
300 ngx_http_set_ctx(r, ctx, ngx_http_charset_filter_module); 372 ngx_http_set_ctx(r, ctx, ngx_http_charset_filter_module);
301 373
302 ctx->table = charsets[source_charset].tables[charset]; 374 ctx->table = charsets[source_charset].tables[charset];
303 ctx->charset = charset; 375 ctx->charset = charset;
376 ctx->length = charsets[charset].length;
377 ctx->from_utf8 = charsets[source_charset].utf8;
378 ctx->to_utf8 = charsets[charset].utf8;
379
380 if ((ctx->to_utf8 || ctx->from_utf8) && r == r->main) {
381 ngx_http_clear_content_length(r);
382 }
304 383
305 r->filter_need_in_memory = 1; 384 r->filter_need_in_memory = 1;
306 385
307 return ngx_http_next_header_filter(r); 386 return ngx_http_next_header_filter(r);
308 } 387 }
309 388
310 389
311 static ngx_int_t 390 static ngx_int_t
312 ngx_http_charset_body_filter(ngx_http_request_t *r, ngx_chain_t *in) 391 ngx_http_charset_body_filter(ngx_http_request_t *r, ngx_chain_t *in)
313 { 392 {
314 ngx_chain_t *cl; 393 ngx_int_t rc;
394 ngx_buf_t *b;
395 ngx_chain_t *cl, *out, **ll;
315 ngx_http_charset_ctx_t *ctx; 396 ngx_http_charset_ctx_t *ctx;
316 397
317 ctx = ngx_http_get_module_ctx(r, ngx_http_charset_filter_module); 398 ctx = ngx_http_get_module_ctx(r, ngx_http_charset_filter_module);
318 399
319 if (ctx == NULL || ctx->table == NULL) { 400 if (ctx == NULL || ctx->table == NULL) {
320 return ngx_http_next_body_filter(r, in); 401 return ngx_http_next_body_filter(r, in);
402 }
403
404 if ((ctx->to_utf8 || ctx->from_utf8) || ctx->busy) {
405
406 out = NULL;
407 ll = &out;
408
409 for (cl = in; cl; cl = cl->next) {
410 b = cl->buf;
411
412 if (ngx_buf_size(b) == 0) {
413 continue;
414 }
415
416 if (ctx->to_utf8) {
417 *ll = ngx_http_charset_recode_to_utf8(r->pool, b, ctx);
418
419 } else {
420 *ll = ngx_http_charset_recode_from_utf8(r->pool, b, ctx);
421 }
422
423 if (*ll == NULL) {
424 return NGX_ERROR;
425 }
426
427 while (*ll) {
428 ll = &(*ll)->next;
429 }
430 }
431
432 rc = ngx_http_next_body_filter(r, out);
433
434 if (out) {
435 if (ctx->busy == NULL) {
436 ctx->busy = out;
437
438 } else {
439 for (cl = ctx->busy; cl->next; cl = cl->next) { /* void */ }
440 cl->next = out;
441 }
442 }
443
444 while (ctx->busy) {
445
446 cl = ctx->busy;
447 b = cl->buf;
448
449 if (ngx_buf_size(b) != 0) {
450 break;
451 }
452
453 #if (NGX_HAVE_WRITE_ZEROCOPY)
454 if (b->zerocopy_busy) {
455 break;
456 }
457 #endif
458
459 ctx->busy = cl->next;
460
461 if (b->tag != (ngx_buf_tag_t) &ngx_http_charset_filter_module) {
462 continue;
463 }
464
465 if (b->shadow) {
466 b->shadow->pos = b->shadow->last;
467 }
468
469 if (b->pos) {
470 cl->next = ctx->free_buffers;
471 ctx->free_buffers = cl;
472 continue;
473 }
474
475 cl->next = ctx->free_bufs;
476 ctx->free_bufs = cl;
477 }
478
479 return rc;
321 } 480 }
322 481
323 for (cl = in; cl; cl = cl->next) { 482 for (cl = in; cl; cl = cl->next) {
324 (void) ngx_http_charset_recode(cl->buf, ctx->table); 483 (void) ngx_http_charset_recode(cl->buf, ctx->table);
325 } 484 }
351 510
352 return 0; 511 return 0;
353 } 512 }
354 513
355 514
515 static ngx_chain_t *
516 ngx_http_charset_recode_from_utf8(ngx_pool_t *pool, ngx_buf_t *buf,
517 ngx_http_charset_ctx_t *ctx)
518 {
519 size_t len, size;
520 u_char c, *p, *src, *dst, *saved, **table;
521 uint32_t n;
522 ngx_buf_t *b;
523 ngx_uint_t i;
524 ngx_chain_t *out, *cl, **ll;
525
526 src = buf->pos;
527
528 if (ctx->saved_len == 0) {
529
530 for ( /* void */ ; src < buf->last; src++) {
531
532 if (*src < 0x80) {
533 continue;
534 }
535
536 len = src - buf->pos;
537
538 if (len > 512) {
539 out = ngx_http_charset_get_buf(pool, ctx);
540 if (out == NULL) {
541 return NULL;
542 }
543
544 b = out->buf;
545
546 b->temporary = buf->temporary;
547 b->memory = buf->memory;
548 b->mmap = buf->mmap;
549 b->flush = buf->flush;
550
551 b->pos = buf->pos;
552 b->last = src;
553
554 out->buf = b;
555 out->next = NULL;
556
557 size = buf->last - src;
558
559 saved = src;
560 n = ngx_utf_decode(&saved, size);
561
562 if (n == 0xfffffffe) {
563 /* incomplete UTF-8 symbol */
564
565 ngx_memcpy(ctx->saved, src, size);
566 ctx->saved_len = size;
567
568 b->shadow = buf;
569
570 return out;
571 }
572
573 } else {
574 out = NULL;
575 size = len + buf->last - src;
576 src = buf->pos;
577 }
578
579 if (size < NGX_HTML_ENTITY_LEN) {
580 size += NGX_HTML_ENTITY_LEN;
581 }
582
583 cl = ngx_http_charset_get_buffer(pool, ctx, size);
584 if (cl == NULL) {
585 return NULL;
586 }
587
588 if (out) {
589 out->next = cl;
590
591 } else {
592 out = cl;
593 }
594
595 b = cl->buf;
596 dst = b->pos;
597
598 goto recode;
599 }
600
601 out = ngx_alloc_chain_link(pool);
602 if (out == NULL) {
603 return NULL;
604 }
605
606 out->buf = buf;
607 out->next = NULL;
608
609 return out;
610 }
611
612 /* process incomplete UTF sequence from previous buffer */
613
614 ngx_log_debug1(NGX_LOG_DEBUG_HTTP, pool->log, 0,
615 "http charset utf saved: %z", ctx->saved_len);
616
617 p = src;
618
619 for (i = ctx->saved_len; i < NGX_UTF_LEN; i++) {
620 ctx->saved[i] = *p++;
621
622 if (p == buf->last) {
623 break;
624 }
625 }
626
627 saved = ctx->saved;
628 n = ngx_utf_decode(&saved, i);
629
630 c = '\0';
631
632 if (n < 0x10000) {
633 table = (u_char **) ctx->table;
634 p = table[n >> 8];
635
636 if (p) {
637 c = p[n & 0xff];
638 }
639
640 } else if (n == 0xfffffffe) {
641
642 /* incomplete UTF-8 symbol */
643
644 if (i < NGX_UTF_LEN) {
645 out = ngx_http_charset_get_buf(pool, ctx);
646 if (out == NULL) {
647 return NULL;
648 }
649
650 b = out->buf;
651
652 b->pos = buf->pos;
653 b->last = buf->last;
654 b->sync = 1;
655 b->shadow = buf;
656
657 ngx_memcpy(&ctx->saved[ctx->saved_len], src, i);
658 ctx->saved_len += i;
659
660 return out;
661 }
662 }
663
664 size = buf->last - buf->pos;
665
666 if (size < NGX_HTML_ENTITY_LEN) {
667 size += NGX_HTML_ENTITY_LEN;
668 }
669
670 cl = ngx_http_charset_get_buffer(pool, ctx, size);
671 if (cl == NULL) {
672 return NULL;
673 }
674
675 out = cl;
676
677 b = cl->buf;
678 dst = b->pos;
679
680 if (c) {
681 *dst++ = c;
682
683 } else if (n == 0xfffffffe) {
684 *dst++ = '?';
685
686 ngx_log_debug0(NGX_LOG_DEBUG_HTTP, pool->log, 0,
687 "http charset invalid utf 0");
688
689 saved = &ctx->saved[NGX_UTF_LEN];
690
691 } else if (n > 0x10ffff) {
692 *dst++ = '?';
693
694 ngx_log_debug0(NGX_LOG_DEBUG_HTTP, pool->log, 0,
695 "http charset invalid utf 1");
696
697 } else {
698 dst = ngx_sprintf(dst, "&#%uD;", n);
699 }
700
701 src += (saved - ctx->saved) - ctx->saved_len;
702 ctx->saved_len = 0;
703
704 recode:
705
706 ll = &cl->next;
707
708 table = (u_char **) ctx->table;
709
710 while (src < buf->last) {
711
712 if ((size_t) (b->end - dst) < NGX_HTML_ENTITY_LEN) {
713 b->last = dst;
714
715 size = buf->last - src + NGX_HTML_ENTITY_LEN;
716
717 cl = ngx_http_charset_get_buffer(pool, ctx, size);
718 if (cl == NULL) {
719 return NULL;
720 }
721
722 *ll = cl;
723 ll = &cl->next;
724
725 b = cl->buf;
726 dst = b->pos;
727 }
728
729 if (*src < 0x80) {
730 *dst++ = *src++;
731 continue;
732 }
733
734 len = buf->last - src;
735
736 n = ngx_utf_decode(&src, len);
737
738 if (n < 0x10000) {
739
740 p = table[n >> 8];
741
742 if (p) {
743 c = p[n & 0xff];
744
745 if (c) {
746 *dst++ = c;
747 continue;
748 }
749 }
750
751 dst = ngx_sprintf(dst, "&#%uD;", n);
752
753 continue;
754 }
755
756 if (n == 0xfffffffe) {
757 /* incomplete UTF-8 symbol */
758
759 ngx_memcpy(ctx->saved, src, len);
760 ctx->saved_len = len;
761
762 if (b->pos == dst) {
763 b->sync = 1;
764 b->temporary = 0;
765 }
766
767 break;
768 }
769
770 if (n > 0x10ffff) {
771 *dst++ = '?';
772
773 ngx_log_debug0(NGX_LOG_DEBUG_HTTP, pool->log, 0,
774 "http charset invalid utf 2");
775
776 continue;
777 }
778
779 /* n > 0xffff */
780
781 dst = ngx_sprintf(dst, "&#%uD;", n);
782 }
783
784 b->last = dst;
785
786 b->last_buf = buf->last_buf;
787 b->last_in_chain = buf->last_in_chain;
788 b->flush = buf->flush;
789
790 b->shadow = buf;
791
792 return out;
793 }
794
795
796 static ngx_chain_t *
797 ngx_http_charset_recode_to_utf8(ngx_pool_t *pool, ngx_buf_t *buf,
798 ngx_http_charset_ctx_t *ctx)
799 {
800 size_t len, size;
801 u_char *p, *src, *dst, *table;
802 ngx_buf_t *b;
803 ngx_chain_t *out, *cl, **ll;
804
805 table = ctx->table;
806
807 for (src = buf->pos; src < buf->last; src++) {
808 if (table[*src * NGX_UTF_LEN] == '\1') {
809 continue;
810 }
811
812 goto recode;
813 }
814
815 out = ngx_alloc_chain_link(pool);
816 if (out == NULL) {
817 return NULL;
818 }
819
820 out->buf = buf;
821 out->next = NULL;
822
823 return out;
824
825 recode:
826
827 /*
828 * we assume that there are about half of characters to be recoded,
829 * so we preallocate "size / 2 + size / 2 * ctx->length"
830 */
831
832 len = src - buf->pos;
833
834 if (len > 512) {
835 out = ngx_http_charset_get_buf(pool, ctx);
836 if (out == NULL) {
837 return NULL;
838 }
839
840 b = out->buf;
841
842 b->temporary = buf->temporary;
843 b->memory = buf->memory;
844 b->mmap = buf->mmap;
845 b->flush = buf->flush;
846
847 b->pos = buf->pos;
848 b->last = src;
849
850 out->buf = b;
851 out->next = NULL;
852
853 size = buf->last - src;
854 size = size / 2 + size / 2 * ctx->length;
855
856 } else {
857 out = NULL;
858
859 size = buf->last - src;
860 size = len + size / 2 + size / 2 * ctx->length;
861
862 src = buf->pos;
863 }
864
865 cl = ngx_http_charset_get_buffer(pool, ctx, size);
866 if (cl == NULL) {
867 return NULL;
868 }
869
870 if (out) {
871 out->next = cl;
872
873 } else {
874 out = cl;
875 }
876
877 ll = &cl->next;
878
879 b = cl->buf;
880 dst = b->pos;
881
882 while (src < buf->last) {
883
884 p = &table[*src++ * NGX_UTF_LEN];
885 len = *p++;
886
887 if ((size_t) (b->end - dst) < len) {
888 b->last = dst;
889
890 size = buf->last - src;
891 size = len + size / 2 + size / 2 * ctx->length;
892
893 cl = ngx_http_charset_get_buffer(pool, ctx, size);
894 if (cl == NULL) {
895 return NULL;
896 }
897
898 *ll = cl;
899 ll = &cl->next;
900
901 b = cl->buf;
902 dst = b->pos;
903 }
904
905 while (len) {
906 *dst++ = *p++;
907 len--;
908 }
909 }
910
911 b->last = dst;
912
913 b->last_buf = buf->last_buf;
914 b->last_in_chain = buf->last_in_chain;
915 b->flush = buf->flush;
916
917 b->shadow = buf;
918
919 return out;
920 }
921
922
923 static ngx_chain_t *
924 ngx_http_charset_get_buf(ngx_pool_t *pool, ngx_http_charset_ctx_t *ctx)
925 {
926 ngx_chain_t *cl;
927
928 cl = ctx->free_bufs;
929
930 if (cl) {
931 ctx->free_bufs = cl->next;
932
933 cl->buf->shadow = NULL;
934 cl->next = NULL;
935
936 return cl;
937 }
938
939 cl = ngx_alloc_chain_link(pool);
940 if (cl == NULL) {
941 return NULL;
942 }
943
944 cl->buf = ngx_calloc_buf(pool);
945 if (cl->buf == NULL) {
946 return NULL;
947 }
948
949 cl->next = NULL;
950
951 cl->buf->tag = (ngx_buf_tag_t) &ngx_http_charset_filter_module;
952
953 return cl;
954 }
955
956
957 static ngx_chain_t *
958 ngx_http_charset_get_buffer(ngx_pool_t *pool, ngx_http_charset_ctx_t *ctx,
959 size_t size)
960 {
961 ngx_buf_t *b;
962 ngx_chain_t *cl, **ll;
963
964 for (ll = &ctx->free_buffers, cl = ctx->free_buffers;
965 cl;
966 ll = &cl->next, cl = cl->next)
967 {
968 b = cl->buf;
969
970 if ((size_t) (b->end - b->start) >= size) {
971 *ll = cl->next;
972 cl->next = NULL;
973
974 b->pos = b->start;
975 b->temporary = 1;
976 b->shadow = NULL;
977
978 return cl;
979 }
980 }
981
982 cl = ngx_alloc_chain_link(pool);
983 if (cl == NULL) {
984 return NULL;
985 }
986
987 cl->buf = ngx_create_temp_buf(pool, size);
988 if (cl->buf == NULL) {
989 return NULL;
990 }
991
992 cl->next = NULL;
993
994 cl->buf->temporary = 1;
995 cl->buf->tag = (ngx_buf_tag_t) &ngx_http_charset_filter_module;
996
997 return cl;
998 }
999
1000
356 static char * 1001 static char *
357 ngx_charset_map_block(ngx_conf_t *cf, ngx_command_t *cmd, void *conf) 1002 ngx_http_charset_map_block(ngx_conf_t *cf, ngx_command_t *cmd, void *conf)
358 { 1003 {
359 ngx_http_charset_main_conf_t *mcf = conf; 1004 ngx_http_charset_main_conf_t *mcf = conf;
360 1005
361 char *rv; 1006 char *rv;
362 ngx_int_t src, dst; 1007 u_char *p, *dst2src, **pp;
363 ngx_uint_t i; 1008 ngx_int_t src, dst;
364 ngx_str_t *value; 1009 ngx_uint_t i, n;
365 ngx_conf_t pvcf; 1010 ngx_str_t *value;
366 ngx_http_charset_tables_t *table; 1011 ngx_conf_t pvcf;
1012 ngx_http_charset_t *charset;
1013 ngx_http_charset_tables_t *table;
1014 ngx_http_charset_conf_ctx_t ctx;
367 1015
368 value = cf->args->elts; 1016 value = cf->args->elts;
369 1017
370 src = ngx_http_add_charset(&mcf->charsets, &value[1]); 1018 src = ngx_http_add_charset(&mcf->charsets, &value[1]);
371 if (src == NGX_ERROR) { 1019 if (src == NGX_ERROR) {
402 } 1050 }
403 1051
404 table->src = src; 1052 table->src = src;
405 table->dst = dst; 1053 table->dst = dst;
406 1054
407 table->src2dst = ngx_palloc(cf->pool, 256); 1055 if (ngx_strcasecmp(value[2].data, "utf-8") == 0) {
408 if (table->src2dst == NULL) { 1056 table->src2dst = ngx_pcalloc(cf->pool, 256 * NGX_UTF_LEN);
409 return NGX_CONF_ERROR; 1057 if (table->src2dst == NULL) {
410 } 1058 return NGX_CONF_ERROR;
411 1059 }
412 table->dst2src = ngx_palloc(cf->pool, 256); 1060
413 if (table->dst2src == NULL) { 1061 table->dst2src = ngx_pcalloc(cf->pool, 256 * sizeof(void *));
414 return NGX_CONF_ERROR; 1062 if (table->dst2src == NULL) {
415 } 1063 return NGX_CONF_ERROR;
416 1064 }
417 for (i = 0; i < 128; i++) { 1065
418 table->src2dst[i] = (u_char) i; 1066 dst2src = ngx_pcalloc(cf->pool, 256);
419 table->dst2src[i] = (u_char) i; 1067 if (dst2src == NULL) {
420 } 1068 return NGX_CONF_ERROR;
421 1069 }
422 for (/* void */; i < 256; i++) { 1070
423 table->src2dst[i] = '?'; 1071 pp = (u_char **) &table->dst2src[0];
424 table->dst2src[i] = '?'; 1072 pp[0] = dst2src;
425 } 1073
1074 for (i = 0; i < 128; i++) {
1075 p = &table->src2dst[i * NGX_UTF_LEN];
1076 p[0] = '\1';
1077 p[1] = (u_char) i;
1078 dst2src[i] = (u_char) i;
1079 }
1080
1081 for (/* void */; i < 256; i++) {
1082 p = &table->src2dst[i * NGX_UTF_LEN];
1083 p[0] = '\1';
1084 p[1] = '?';
1085 }
1086
1087 } else {
1088 table->src2dst = ngx_palloc(cf->pool, 256);
1089 if (table->src2dst == NULL) {
1090 return NGX_CONF_ERROR;
1091 }
1092
1093 table->dst2src = ngx_palloc(cf->pool, 256);
1094 if (table->dst2src == NULL) {
1095 return NGX_CONF_ERROR;
1096 }
1097
1098 for (i = 0; i < 128; i++) {
1099 table->src2dst[i] = (u_char) i;
1100 table->dst2src[i] = (u_char) i;
1101 }
1102
1103 for (/* void */; i < 256; i++) {
1104 table->src2dst[i] = '?';
1105 table->dst2src[i] = '?';
1106 }
1107 }
1108
1109 charset = mcf->charsets.elts;
1110
1111 ctx.table = table;
1112 ctx.charset = &charset[dst];
1113 ctx.characters = 0;
426 1114
427 pvcf = *cf; 1115 pvcf = *cf;
428 cf->ctx = table; 1116 cf->ctx = &ctx;
429 cf->handler = ngx_charset_map; 1117 cf->handler = ngx_http_charset_map;
430 cf->handler_conf = conf; 1118 cf->handler_conf = conf;
431 1119
432 rv = ngx_conf_parse(cf, NULL); 1120 rv = ngx_conf_parse(cf, NULL);
433 1121
434 *cf = pvcf; 1122 *cf = pvcf;
435 1123
1124 if (ctx.characters) {
1125 n = ctx.charset->length;
1126 ctx.charset->length /= ctx.characters;
1127
1128 if (((n * 10) / ctx.characters) % 10 > 4) {
1129 ctx.charset->length++;
1130 }
1131 }
1132
436 return rv; 1133 return rv;
437 } 1134 }
438 1135
439 1136
440 static char * 1137 static char *
441 ngx_charset_map(ngx_conf_t *cf, ngx_command_t *dummy, void *conf) 1138 ngx_http_charset_map(ngx_conf_t *cf, ngx_command_t *dummy, void *conf)
442 { 1139 {
443 ngx_int_t src, dst; 1140 u_char *p, *dst2src, **pp;
444 ngx_str_t *value; 1141 uint32_t n;
445 ngx_http_charset_tables_t *table; 1142 ngx_int_t src, dst;
1143 ngx_str_t *value;
1144 ngx_uint_t i;
1145 ngx_http_charset_tables_t *table;
1146 ngx_http_charset_conf_ctx_t *ctx;
446 1147
447 if (cf->args->nelts != 2) { 1148 if (cf->args->nelts != 2) {
448 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, "invalid parameters number"); 1149 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, "invalid parameters number");
449 return NGX_CONF_ERROR; 1150 return NGX_CONF_ERROR;
450 } 1151 }
456 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, 1157 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
457 "invalid value \"%V\"", &value[0]); 1158 "invalid value \"%V\"", &value[0]);
458 return NGX_CONF_ERROR; 1159 return NGX_CONF_ERROR;
459 } 1160 }
460 1161
461 dst = ngx_hextoi(value[1].data, value[1].len); 1162 ctx = cf->ctx;
462 if (dst == NGX_ERROR || dst > 255) { 1163 table = ctx->table;
463 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0, 1164
464 "invalid value \"%V\"", &value[1]); 1165 if (ctx->charset->utf8) {
465 return NGX_CONF_ERROR; 1166 p = &table->src2dst[src * NGX_UTF_LEN];
466 } 1167
467 1168 *p++ = (u_char) (value[1].len / 2);
468 table = cf->ctx; 1169
469 1170 for (i = 0; i < value[1].len; i += 2) {
470 table->src2dst[src] = (u_char) dst; 1171 dst = ngx_hextoi(&value[1].data[i], 2);
471 table->dst2src[dst] = (u_char) src; 1172 if (dst == NGX_ERROR || dst > 255) {
1173 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
1174 "invalid value \"%V\"", &value[1]);
1175 return NGX_CONF_ERROR;
1176 }
1177
1178 *p++ = (u_char) dst;
1179 }
1180
1181 i /= 2;
1182
1183 ctx->charset->length += i;
1184 ctx->characters++;
1185
1186 p = &table->src2dst[src * NGX_UTF_LEN] + 1;
1187
1188 n = ngx_utf_decode(&p, i);
1189
1190 if (n > 0xffff) {
1191 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
1192 "invalid value \"%V\"", &value[1]);
1193 return NGX_CONF_ERROR;
1194 }
1195
1196 pp = (u_char **) &table->dst2src[0];
1197
1198 dst2src = pp[n >> 8];
1199
1200 if (dst2src == NULL) {
1201 dst2src = ngx_pcalloc(cf->pool, 256);
1202 if (dst2src == NULL) {
1203 return NGX_CONF_ERROR;
1204 }
1205
1206 pp[n >> 8] = dst2src;
1207 }
1208
1209 dst2src[n & 0xff] = (u_char) src;
1210
1211 } else {
1212 dst = ngx_hextoi(value[1].data, value[1].len);
1213 if (dst == NGX_ERROR || dst > 255) {
1214 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
1215 "invalid value \"%V\"", &value[1]);
1216 return NGX_CONF_ERROR;
1217 }
1218
1219 table->src2dst[src] = (u_char) dst;
1220 table->dst2src[dst] = (u_char) src;
1221 }
472 1222
473 return NGX_CONF_OK; 1223 return NGX_CONF_OK;
474 } 1224 }
475 1225
476 1226
536 return NGX_ERROR; 1286 return NGX_ERROR;
537 } 1287 }
538 1288
539 c->tables = NULL; 1289 c->tables = NULL;
540 c->name = *name; 1290 c->name = *name;
1291 c->length = 0;
541 1292
542 if (ngx_strcasecmp(name->data, "utf-8") == 0) { 1293 if (ngx_strcasecmp(name->data, "utf-8") == 0) {
543 c->utf8 = 1; 1294 c->utf8 = 1;
544 } 1295 }
545 1296