Ruby 3.1.3p185 (2022-11-24 revision 1a6b16756e0ba6b95ab71a441357ed5484e33498)
transcode.c
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "ruby/encoding.h"
23
24#include "transcode_data.h"
25#include "id.h"
26
/* Compile-time switch: when defined, the extra newline-option symbols below are declared. */
27#define ENABLE_ECONV_NEWLINE_OPTION 1
28
29/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
/* Exception class handles; they are assigned outside the part of the file
 * shown here (presumably at init time -- TODO confirm). */
30static VALUE rb_eUndefinedConversionError;
31static VALUE rb_eInvalidByteSequenceError;
32static VALUE rb_eConverterNotFoundError;
33
34VALUE rb_cEncodingConverter;
35
/* Cached IDs; judging by their names they identify error attributes --
 * NOTE(review): confirm against the code that interns them. */
36static ID id_destination_encoding;
37static ID id_destination_encoding_name;
38static ID id_error_bytes;
39static ID id_error_char;
40static ID id_incomplete_input;
41static ID id_readagain_bytes;
42static ID id_source_encoding;
43static ID id_source_encoding_name;
44
/* Cached Symbol values for option handling (names suggest option keys and
 * values -- TODO confirm where they are initialized). */
45static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
46static VALUE sym_xml, sym_text, sym_attr;
47static VALUE sym_universal_newline;
48static VALUE sym_crlf_newline;
49static VALUE sym_cr_newline;
50#ifdef ENABLE_ECONV_NEWLINE_OPTION
51static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
52#endif
53static VALUE sym_partial_input;
54
/* Cached Symbol values whose names mirror the econv_* result codes used by
 * the conversion engine below. */
55static VALUE sym_invalid_byte_sequence;
56static VALUE sym_undefined_conversion;
57static VALUE sym_destination_buffer_full;
58static VALUE sym_source_buffer_empty;
59static VALUE sym_finished;
60static VALUE sym_after_output;
61static VALUE sym_incomplete_input;
62
/* Forward declaration; the definition is not within the lines visible here. */
63static unsigned char *
64allocate_converted_string(const char *sname, const char *dname,
65 const unsigned char *str, size_t len,
66 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
67 size_t *dst_len_ptr);
68
69/* dynamic structure, one per conversion (similar to iconv_t) */
70/* may carry conversion state (e.g. for iso-2022-jp) */
71typedef struct rb_transcoding {
72 const rb_transcoder *transcoder;
73
74 int flags;
75
76 int resume_position;
77 unsigned int next_table;
78 VALUE next_info;
79 unsigned char next_byte;
80 unsigned int output_index;
81
82 ssize_t recognized_len; /* already interpreted */
83 ssize_t readagain_len; /* not yet interpreted */
84 union {
85 unsigned char ary[8]; /* max_input <= sizeof(ary) */
86 unsigned char *ptr; /* length: max_input */
87 } readbuf; /* recognized_len + readagain_len used */
88
89 ssize_t writebuf_off;
90 ssize_t writebuf_len;
91 union {
92 unsigned char ary[8]; /* max_output <= sizeof(ary) */
93 unsigned char *ptr; /* length: max_output */
94 } writebuf;
95
96 union rb_transcoding_state_t { /* opaque data for stateful encoding */
97 void *ptr;
98 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
99 double dummy_for_alignment;
100 } state;
102#define TRANSCODING_READBUF(tc) \
103 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
104 (tc)->readbuf.ary : \
105 (tc)->readbuf.ptr)
106#define TRANSCODING_WRITEBUF(tc) \
107 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
108 (tc)->writebuf.ary : \
109 (tc)->writebuf.ptr)
110#define TRANSCODING_WRITEBUF_SIZE(tc) \
111 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
112 sizeof((tc)->writebuf.ary) : \
113 (size_t)(tc)->transcoder->max_output)
114#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
115#define TRANSCODING_STATE(tc) \
116 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
117 (tc)->state.ary : \
118 (tc)->state.ptr)
119
120typedef struct {
121 struct rb_transcoding *tc;
122 unsigned char *out_buf_start;
123 unsigned char *out_data_start;
124 unsigned char *out_data_end;
125 unsigned char *out_buf_end;
126 rb_econv_result_t last_result;
128
130 int flags;
131 int started; /* bool */
132
133 const char *source_encoding_name;
134 const char *destination_encoding_name;
135
136 const unsigned char *replacement_str;
137 size_t replacement_len;
138 const char *replacement_enc;
139
140 unsigned char *in_buf_start;
141 unsigned char *in_data_start;
142 unsigned char *in_data_end;
143 unsigned char *in_buf_end;
144 rb_econv_elem_t *elems;
145 int replacement_allocated; /* bool */
146 int num_allocated;
147 int num_trans;
148 int num_finished;
149 struct rb_transcoding *last_tc;
150
151 /* last error */
152 struct {
153 rb_econv_result_t result;
154 struct rb_transcoding *error_tc;
155 const char *source_encoding;
156 const char *destination_encoding;
157 const unsigned char *error_bytes_start;
158 size_t error_bytes_len;
159 size_t readagain_len;
160 } last_error;
161
162 /* The following fields are only for Encoding::Converter.
163 * rb_econv_open set them NULL. */
164 rb_encoding *source_encoding;
165 rb_encoding *destination_encoding;
166};
167
168/*
169 * Dispatch data and logic
170 */
171
172#define DECORATOR_P(sname, dname) (*(sname) == '\0')
173
174typedef struct {
175 const char *sname;
176 const char *dname;
177 const char *lib; /* null means no need to load a library */
178 const rb_transcoder *transcoder;
180
181static st_table *transcoder_table;
182
183static transcoder_entry_t *
184make_transcoder_entry(const char *sname, const char *dname)
185{
186 st_data_t val;
187 st_table *table2;
188
189 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
190 val = (st_data_t)st_init_strcasetable();
191 st_add_direct(transcoder_table, (st_data_t)sname, val);
192 }
193 table2 = (st_table *)val;
194 if (!st_lookup(table2, (st_data_t)dname, &val)) {
196 entry->sname = sname;
197 entry->dname = dname;
198 entry->lib = NULL;
199 entry->transcoder = NULL;
200 val = (st_data_t)entry;
201 st_add_direct(table2, (st_data_t)dname, val);
202 }
203 return (transcoder_entry_t *)val;
204}
205
206static transcoder_entry_t *
207get_transcoder_entry(const char *sname, const char *dname)
208{
209 st_data_t val;
210 st_table *table2;
211
212 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
213 return NULL;
214 }
215 table2 = (st_table *)val;
216 if (!st_lookup(table2, (st_data_t)dname, &val)) {
217 return NULL;
218 }
219 return (transcoder_entry_t *)val;
220}
221
222void
223rb_register_transcoder(const rb_transcoder *tr)
224{
225 const char *const sname = tr->src_encoding;
226 const char *const dname = tr->dst_encoding;
227
228 transcoder_entry_t *entry;
229
230 entry = make_transcoder_entry(sname, dname);
231 if (entry->transcoder) {
232 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
233 sname, dname);
234 }
235
236 entry->transcoder = tr;
237}
238
239static void
240declare_transcoder(const char *sname, const char *dname, const char *lib)
241{
242 transcoder_entry_t *entry;
243
244 entry = make_transcoder_entry(sname, dname);
245 entry->lib = lib;
246}
247
/* Path prefix that load_transcoder_entry() puts in front of an entry's lib name. */
248static const char transcoder_lib_prefix[] = "enc/trans/";
249
250void
251rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
252{
253 if (!lib) {
254 rb_raise(rb_eArgError, "invalid library name - (null)");
255 }
256 declare_transcoder(enc1, enc2, lib);
257}
258
259#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
260
261typedef struct search_path_queue_tag {
262 struct search_path_queue_tag *next;
263 const char *enc;
265
266typedef struct {
267 st_table *visited;
268 search_path_queue_t *queue;
269 search_path_queue_t **queue_last_ptr;
270 const char *base_enc;
272
273static int
274transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
275{
276 const char *dname = (const char *)key;
279
280 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
281 return ST_CONTINUE;
282 }
283
285 q->enc = dname;
286 q->next = NULL;
287 *bfs->queue_last_ptr = q;
288 bfs->queue_last_ptr = &q->next;
289
290 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
291 return ST_CONTINUE;
292}
293
294static int
295transcode_search_path(const char *sname, const char *dname,
296 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
297 void *arg)
298{
301 st_data_t val;
302 st_table *table2;
303 int found;
304 int pathlen = -1;
305
306 if (encoding_equal(sname, dname))
307 return -1;
308
310 q->enc = sname;
311 q->next = NULL;
312 bfs.queue_last_ptr = &q->next;
313 bfs.queue = q;
314
315 bfs.visited = st_init_strcasetable();
316 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
317
318 while (bfs.queue) {
319 q = bfs.queue;
320 bfs.queue = q->next;
321 if (!bfs.queue)
322 bfs.queue_last_ptr = &bfs.queue;
323
324 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
325 xfree(q);
326 continue;
327 }
328 table2 = (st_table *)val;
329
330 if (st_lookup(table2, (st_data_t)dname, &val)) {
331 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
332 xfree(q);
333 found = 1;
334 goto cleanup;
335 }
336
337 bfs.base_enc = q->enc;
338 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
339 bfs.base_enc = NULL;
340
341 xfree(q);
342 }
343 found = 0;
344
345 cleanup:
346 while (bfs.queue) {
347 q = bfs.queue;
348 bfs.queue = q->next;
349 xfree(q);
350 }
351
352 if (found) {
353 const char *enc = dname;
354 int depth;
355 pathlen = 0;
356 while (1) {
357 st_lookup(bfs.visited, (st_data_t)enc, &val);
358 if (!val)
359 break;
360 pathlen++;
361 enc = (const char *)val;
362 }
363 depth = pathlen;
364 enc = dname;
365 while (1) {
366 st_lookup(bfs.visited, (st_data_t)enc, &val);
367 if (!val)
368 break;
369 callback((const char *)val, enc, --depth, arg);
370 enc = (const char *)val;
371 }
372 }
373
374 st_free_table(bfs.visited);
375
376 return pathlen; /* is -1 if not found */
377}
378
379int rb_require_internal_silent(VALUE fname);
380
381static const rb_transcoder *
382load_transcoder_entry(transcoder_entry_t *entry)
383{
384 if (entry->transcoder)
385 return entry->transcoder;
386
387 if (entry->lib) {
388 const char *const lib = entry->lib;
389 const size_t len = strlen(lib);
390 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
391 const VALUE fn = rb_str_new(0, total_len);
392 char *const path = RSTRING_PTR(fn);
393
394 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
395 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
396 rb_str_set_len(fn, total_len);
397 OBJ_FREEZE(fn);
398 rb_require_internal_silent(fn);
399 }
400
401 if (entry->transcoder)
402 return entry->transcoder;
403
404 return NULL;
405}
406
407static const char*
408get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
409{
410 if (encoding_equal(encname, "UTF-8")) {
411 *len_ret = 3;
412 *repl_encname_ptr = "UTF-8";
413 return "\xEF\xBF\xBD";
414 }
415 else {
416 *len_ret = 1;
417 *repl_encname_ptr = "US-ASCII";
418 return "?";
419 }
420}
421
422/*
423 * Transcoding engine logic
424 */
425
426static const unsigned char *
427transcode_char_start(rb_transcoding *tc,
428 const unsigned char *in_start,
429 const unsigned char *inchar_start,
430 const unsigned char *in_p,
431 size_t *char_len_ptr)
432{
433 const unsigned char *ptr;
434 if (inchar_start - in_start < tc->recognized_len) {
435 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
436 inchar_start, unsigned char, in_p - inchar_start);
437 ptr = TRANSCODING_READBUF(tc);
438 }
439 else {
440 ptr = inchar_start - tc->recognized_len;
441 }
442 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
443 return ptr;
444}
445
/*
 * Core conversion loop for one transcoding instance.
 *
 * Reads bytes from *in_pos up to in_stop, follows the transcoder's lookup
 * tables starting at tr->conv_tree_start, and writes result bytes from
 * *out_pos up to out_stop.
 *
 * The function is resumable: every SUSPEND() stores a label number in
 * tc->resume_position before returning, and the switch at the top of the
 * body jumps back to that label on the next call.
 *
 * opt is tested for ECONV_PARTIAL_INPUT (running out of input returns
 * econv_source_buffer_empty instead of ending the loop or reporting
 * incomplete input) and ECONV_AFTER_OUTPUT (return econv_after_output
 * once something has been written in this call).
 *
 * Returns one of econv_destination_buffer_full, econv_source_buffer_empty,
 * econv_after_output, econv_invalid_byte_sequence, econv_incomplete_input,
 * econv_undefined_conversion or econv_finished.
 */
446static rb_econv_result_t
447transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
448 const unsigned char *in_stop, unsigned char *out_stop,
449 rb_transcoding *tc,
450 const int opt)
451{
452 const rb_transcoder *tr = tc->transcoder;
453 int unitlen = tr->input_unit_length;
454 ssize_t readagain_len = 0;
455
456 const unsigned char *inchar_start;
457 const unsigned char *in_p;
458
459 unsigned char *out_p;
460
461 in_p = inchar_start = *in_pos;
462
463 out_p = *out_pos;
464
/*
 * SUSPEND(ret, num): remember label num, copy the bytes consumed for the
 * current character into the read buffer after the recognized ones,
 * publish in_p/out_p through *in_pos/*out_pos, move readagain_len bytes
 * (if any) from the recognized count to tc->readagain_len, and return ret.
 * The label resume_label<num> is where execution continues next time.
 */
465#define SUSPEND(ret, num) \
466 do { \
467 tc->resume_position = (num); \
468 if (0 < in_p - inchar_start) \
469 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
470 inchar_start, unsigned char, in_p - inchar_start); \
471 *in_pos = in_p; \
472 *out_pos = out_p; \
473 tc->recognized_len += in_p - inchar_start; \
474 if (readagain_len) { \
475 tc->recognized_len -= readagain_len; \
476 tc->readagain_len = readagain_len; \
477 } \
478 return (ret); \
479 resume_label ## num:; \
480 } while (0)
/* Suspend with econv_destination_buffer_full until one output byte fits. */
481#define SUSPEND_OBUF(num) \
482 do { \
483 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
484 } while (0)
485
/* Suspend with econv_after_output if requested and output was produced in this call. */
486#define SUSPEND_AFTER_OUTPUT(num) \
487 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
488 SUSPEND(econv_after_output, num); \
489 }
490
/* The following names alias fields of tc so their values survive a suspend. */
491#define next_table (tc->next_table)
492#define next_info (tc->next_info)
493#define next_byte (tc->next_byte)
494#define writebuf_len (tc->writebuf_len)
495#define writebuf_off (tc->writebuf_off)
496
/* Jump back to where the previous call suspended (0 means a fresh start). */
497 switch (tc->resume_position) {
498 case 0: break;
499 case 1: goto resume_label1;
500 case 2: goto resume_label2;
501 case 3: goto resume_label3;
502 case 4: goto resume_label4;
503 case 5: goto resume_label5;
504 case 6: goto resume_label6;
505 case 7: goto resume_label7;
506 case 8: goto resume_label8;
507 case 9: goto resume_label9;
508 case 10: goto resume_label10;
509 case 11: goto resume_label11;
510 case 12: goto resume_label12;
511 case 13: goto resume_label13;
512 case 14: goto resume_label14;
513 case 15: goto resume_label15;
514 case 16: goto resume_label16;
515 case 17: goto resume_label17;
516 case 18: goto resume_label18;
517 case 19: goto resume_label19;
518 case 20: goto resume_label20;
519 case 21: goto resume_label21;
520 case 22: goto resume_label22;
521 case 23: goto resume_label23;
522 case 24: goto resume_label24;
523 case 25: goto resume_label25;
524 case 26: goto resume_label26;
525 case 27: goto resume_label27;
526 case 28: goto resume_label28;
527 case 29: goto resume_label29;
528 case 30: goto resume_label30;
529 case 31: goto resume_label31;
530 case 32: goto resume_label32;
531 case 33: goto resume_label33;
532 case 34: goto resume_label34;
533 }
534
/* One iteration per input character. */
535 while (1) {
536 inchar_start = in_p;
537 tc->recognized_len = 0;
538 next_table = tr->conv_tree_start;
539
540 SUSPEND_AFTER_OUTPUT(24);
541
542 if (in_stop <= in_p) {
543 if (!(opt & ECONV_PARTIAL_INPUT))
544 break;
545 SUSPEND(econv_source_buffer_empty, 7);
546 continue;
547 }
548
549#define BYTE_ADDR(index) (tr->byte_array + (index))
550#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
551#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
552#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
553#define BL_MIN_BYTE (BL_BASE[0])
554#define BL_MAX_BYTE (BL_BASE[1])
555#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
556#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
557
558 next_byte = (unsigned char)*in_p++;
559 follow_byte:
560 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
561 next_info = INVALID;
562 else {
563 next_info = (VALUE)BL_ACTION(next_byte);
564 }
565 follow_info:
566 switch (next_info & 0x1F) {
567 case NOMAP:
568 {
569 const unsigned char *p = inchar_start;
570 writebuf_off = 0;
571 while (p < in_p) {
572 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
573 }
574 writebuf_len = writebuf_off;
575 writebuf_off = 0;
576 while (writebuf_off < writebuf_len) {
577 SUSPEND_OBUF(3);
578 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
579 }
580 }
581 continue;
582 case 0x00: case 0x04: case 0x08: case 0x0C:
583 case 0x10: case 0x14: case 0x18: case 0x1C:
584 SUSPEND_AFTER_OUTPUT(25);
585 while (in_p >= in_stop) {
586 if (!(opt & ECONV_PARTIAL_INPUT))
587 goto incomplete;
588 SUSPEND(econv_source_buffer_empty, 5);
589 }
590 next_byte = (unsigned char)*in_p++;
591 next_table = (unsigned int)next_info;
592 goto follow_byte;
593 case ZERObt: /* drop input */
594 continue;
595 case ONEbt:
596 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
597 continue;
598 case TWObt:
599 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
600 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
601 continue;
602 case THREEbt:
603 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
604 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
605 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
606 continue;
607 case FOURbt:
608 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
609 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
610 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
611 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
612 continue;
613 case GB4bt:
614 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
615 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
616 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
617 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
618 continue;
619 case STR1:
620 tc->output_index = 0;
621 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
622 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
623 tc->output_index++;
624 }
625 continue;
626 case FUNii:
627 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
628 goto follow_info;
629 case FUNsi:
630 {
631 const unsigned char *char_start;
632 size_t char_len;
633 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
634 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
635 goto follow_info;
636 }
637 case FUNio:
638 SUSPEND_OBUF(13);
639 if (tr->max_output <= out_stop - out_p)
640 out_p += tr->func_io(TRANSCODING_STATE(tc),
641 next_info, out_p, out_stop - out_p);
642 else {
643 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
644 next_info,
645 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
646 writebuf_off = 0;
647 while (writebuf_off < writebuf_len) {
648 SUSPEND_OBUF(20);
649 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
650 }
651 }
652 break;
653 case FUNso:
654 {
655 const unsigned char *char_start;
656 size_t char_len;
657 SUSPEND_OBUF(14);
658 if (tr->max_output <= out_stop - out_p) {
659 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
660 out_p += tr->func_so(TRANSCODING_STATE(tc),
661 char_start, (size_t)char_len,
662 out_p, out_stop - out_p);
663 }
664 else {
665 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
667 char_start, (size_t)char_len,
668 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
669 writebuf_off = 0;
670 while (writebuf_off < writebuf_len) {
671 SUSPEND_OBUF(22);
672 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
673 }
674 }
675 break;
676 }
677 case FUNsio:
678 {
679 const unsigned char *char_start;
680 size_t char_len;
681 SUSPEND_OBUF(33);
682 if (tr->max_output <= out_stop - out_p) {
683 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
684 out_p += tr->func_sio(TRANSCODING_STATE(tc),
685 char_start, (size_t)char_len, next_info,
686 out_p, out_stop - out_p);
687 }
688 else {
689 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
690 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
691 char_start, (size_t)char_len, next_info,
692 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
693 writebuf_off = 0;
694 while (writebuf_off < writebuf_len) {
695 SUSPEND_OBUF(34);
696 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
697 }
698 }
699 break;
700 }
701 case INVALID:
702 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
703 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
704 SUSPEND_AFTER_OUTPUT(26);
705 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
706 in_p = in_stop;
707 SUSPEND(econv_source_buffer_empty, 8);
708 }
709 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
710 in_p = in_stop;
711 }
712 else {
713 in_p = inchar_start + (unitlen - tc->recognized_len);
714 }
715 }
716 else {
717 ssize_t invalid_len; /* including the last byte which causes invalid */
718 ssize_t discard_len;
719 invalid_len = tc->recognized_len + (in_p - inchar_start);
720 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
721 readagain_len = invalid_len - discard_len;
722 }
723 goto invalid;
724 case UNDEF:
725 goto undef;
726 default:
727 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
728 }
729 continue;
730
731 invalid:
732 SUSPEND(econv_invalid_byte_sequence, 1);
733 continue;
734
735 incomplete:
736 SUSPEND(econv_incomplete_input, 27);
737 continue;
738
739 undef:
740 SUSPEND(econv_undefined_conversion, 2);
741 continue;
742 }
743
744 /* cleanup */
/* Let the transcoder emit trailing bytes, then keep answering econv_finished. */
745 if (tr->finish_func) {
746 SUSPEND_OBUF(4);
747 if (tr->max_output <= out_stop - out_p) {
748 out_p += tr->finish_func(TRANSCODING_STATE(tc),
749 out_p, out_stop - out_p);
750 }
751 else {
752 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
753 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
754 writebuf_off = 0;
755 while (writebuf_off < writebuf_len) {
756 SUSPEND_OBUF(23);
757 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
758 }
759 }
760 }
761 while (1)
762 SUSPEND(econv_finished, 6);
763#undef SUSPEND
764#undef next_table
765#undef next_info
766#undef next_byte
767#undef writebuf_len
768#undef writebuf_off
769}
770
771static rb_econv_result_t
772transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
773 const unsigned char *in_stop, unsigned char *out_stop,
774 rb_transcoding *tc,
775 const int opt)
776{
777 if (tc->readagain_len) {
778 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
779 const unsigned char *readagain_pos = readagain_buf;
780 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
781 rb_econv_result_t res;
782
783 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
784 unsigned char, tc->readagain_len);
785 tc->readagain_len = 0;
786 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
787 if (res != econv_source_buffer_empty) {
788 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
789 readagain_pos, unsigned char, readagain_stop - readagain_pos);
790 tc->readagain_len += readagain_stop - readagain_pos;
791 return res;
792 }
793 }
794 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
795}
796
797static rb_transcoding *
798rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
799{
800 rb_transcoding *tc;
801
802 tc = ALLOC(rb_transcoding);
803 tc->transcoder = tr;
804 tc->flags = flags;
805 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
806 tc->state.ptr = xmalloc(tr->state_size);
807 if (tr->state_init_func) {
808 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
809 }
810 tc->resume_position = 0;
811 tc->recognized_len = 0;
812 tc->readagain_len = 0;
813 tc->writebuf_len = 0;
814 tc->writebuf_off = 0;
815 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
816 tc->readbuf.ptr = xmalloc(tr->max_input);
817 }
818 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
819 tc->writebuf.ptr = xmalloc(tr->max_output);
820 }
821 return tc;
822}
823
824static rb_econv_result_t
825rb_transcoding_convert(rb_transcoding *tc,
826 const unsigned char **input_ptr, const unsigned char *input_stop,
827 unsigned char **output_ptr, unsigned char *output_stop,
828 int flags)
829{
830 return transcode_restartable(
831 input_ptr, output_ptr,
832 input_stop, output_stop,
833 tc, flags);
834}
835
836static void
837rb_transcoding_close(rb_transcoding *tc)
838{
839 const rb_transcoder *tr = tc->transcoder;
840 if (tr->state_fini_func) {
841 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
842 }
843 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
844 xfree(tc->state.ptr);
845 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
846 xfree(tc->readbuf.ptr);
847 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
848 xfree(tc->writebuf.ptr);
849 xfree(tc);
850}
851
852static size_t
853rb_transcoding_memsize(rb_transcoding *tc)
854{
855 size_t size = sizeof(rb_transcoding);
856 const rb_transcoder *tr = tc->transcoder;
857
858 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
859 size += tr->state_size;
860 }
861 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
862 size += tr->max_input;
863 }
864 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
865 size += tr->max_output;
866 }
867 return size;
868}
869
870static rb_econv_t *
871rb_econv_alloc(int n_hint)
872{
873 rb_econv_t *ec;
874
875 if (n_hint <= 0)
876 n_hint = 1;
877
878 ec = ALLOC(rb_econv_t);
879 ec->flags = 0;
880 ec->source_encoding_name = NULL;
881 ec->destination_encoding_name = NULL;
882 ec->started = 0;
883 ec->replacement_str = NULL;
884 ec->replacement_len = 0;
885 ec->replacement_enc = NULL;
886 ec->replacement_allocated = 0;
887 ec->in_buf_start = NULL;
888 ec->in_data_start = NULL;
889 ec->in_data_end = NULL;
890 ec->in_buf_end = NULL;
891 ec->num_allocated = n_hint;
892 ec->num_trans = 0;
893 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
894 ec->num_finished = 0;
895 ec->last_tc = NULL;
896 ec->last_error.result = econv_source_buffer_empty;
897 ec->last_error.error_tc = NULL;
898 ec->last_error.source_encoding = NULL;
899 ec->last_error.destination_encoding = NULL;
900 ec->last_error.error_bytes_start = NULL;
901 ec->last_error.error_bytes_len = 0;
902 ec->last_error.readagain_len = 0;
903 ec->source_encoding = NULL;
904 ec->destination_encoding = NULL;
905 return ec;
906}
907
908static int
909rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
910{
911 int n, j;
912 int bufsize = 4096;
913 unsigned char *p;
914
915 if (ec->num_trans == ec->num_allocated) {
916 n = ec->num_allocated * 2;
917 REALLOC_N(ec->elems, rb_econv_elem_t, n);
918 ec->num_allocated = n;
919 }
920
921 p = xmalloc(bufsize);
922
923 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
924
925 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
926 ec->elems[i].out_buf_start = p;
927 ec->elems[i].out_buf_end = p + bufsize;
928 ec->elems[i].out_data_start = p;
929 ec->elems[i].out_data_end = p;
930 ec->elems[i].last_result = econv_source_buffer_empty;
931
932 ec->num_trans++;
933
934 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
935 for (j = ec->num_trans-1; i <= j; j--) {
936 rb_transcoding *tc = ec->elems[j].tc;
937 const rb_transcoder *tr2 = tc->transcoder;
938 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
939 ec->last_tc = tc;
940 break;
941 }
942 }
943
944 return 0;
945}
946
947static rb_econv_t *
948rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
949{
950 rb_econv_t *ec;
951 int i, ret;
952
953 for (i = 0; i < n; i++) {
954 const rb_transcoder *tr;
955 tr = load_transcoder_entry(entries[i]);
956 if (!tr)
957 return NULL;
958 }
959
960 ec = rb_econv_alloc(n);
961
962 for (i = 0; i < n; i++) {
963 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
964 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
965 if (ret == -1) {
966 rb_econv_close(ec);
967 return NULL;
968 }
969 }
970
971 return ec;
972}
973
975 transcoder_entry_t **entries;
976 int num_additional;
977};
978
979static void
980trans_open_i(const char *sname, const char *dname, int depth, void *arg)
981{
982 struct trans_open_t *toarg = arg;
983
984 if (!toarg->entries) {
985 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
986 }
987 toarg->entries[depth] = get_transcoder_entry(sname, dname);
988}
989
990static rb_econv_t *
991rb_econv_open0(const char *sname, const char *dname, int ecflags)
992{
993 transcoder_entry_t **entries = NULL;
994 int num_trans;
995 rb_econv_t *ec;
996
997 /* Just check if sname and dname are defined */
998 /* (This check is needed?) */
999 if (*sname) rb_enc_find_index(sname);
1000 if (*dname) rb_enc_find_index(dname);
1001
1002 if (*sname == '\0' && *dname == '\0') {
1003 num_trans = 0;
1004 entries = NULL;
1005 sname = dname = "";
1006 }
1007 else {
1008 struct trans_open_t toarg;
1009 toarg.entries = NULL;
1010 toarg.num_additional = 0;
1011 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1012 entries = toarg.entries;
1013 if (num_trans < 0) {
1014 xfree(entries);
1015 return NULL;
1016 }
1017 }
1018
1019 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1020 xfree(entries);
1021 if (!ec)
1022 return NULL;
1023
1024 ec->flags = ecflags;
1025 ec->source_encoding_name = sname;
1026 ec->destination_encoding_name = dname;
1027
1028 return ec;
1029}
1030
1031#define MAX_ECFLAGS_DECORATORS 32
1032
1033static int
1034decorator_names(int ecflags, const char **decorators_ret)
1035{
1036 int num_decorators;
1037
1038 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1042 case 0:
1043 break;
1044 default:
1045 return -1;
1046 }
1047
1048 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1050 return -1;
1051
1052 num_decorators = 0;
1053
1054 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1055 decorators_ret[num_decorators++] = "xml_text_escape";
1057 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1058 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1059 decorators_ret[num_decorators++] = "xml_attr_quote";
1060
1061 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1062 decorators_ret[num_decorators++] = "crlf_newline";
1063 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1064 decorators_ret[num_decorators++] = "cr_newline";
1066 decorators_ret[num_decorators++] = "universal_newline";
1067
1068 return num_decorators;
1069}
1070
1071rb_econv_t *
1072rb_econv_open(const char *sname, const char *dname, int ecflags)
1073{
1074 rb_econv_t *ec;
1075 int num_decorators;
1076 const char *decorators[MAX_ECFLAGS_DECORATORS];
1077 int i;
1078
1079 num_decorators = decorator_names(ecflags, decorators);
1080 if (num_decorators == -1)
1081 return NULL;
1082
1083 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1084 if (!ec)
1085 return NULL;
1086
1087 for (i = 0; i < num_decorators; i++)
1088 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1089 rb_econv_close(ec);
1090 return NULL;
1091 }
1092
1093 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1094
1095 return ec;
1096}
1097
1098static int
1099trans_sweep(rb_econv_t *ec,
1100 const unsigned char **input_ptr, const unsigned char *input_stop,
1101 unsigned char **output_ptr, unsigned char *output_stop,
1102 int flags,
1103 int start)
1104{
1105 int try;
1106 int i, f;
1107
1108 const unsigned char **ipp, *is, *iold;
1109 unsigned char **opp, *os, *oold;
1110 rb_econv_result_t res;
1111
1112 try = 1;
1113 while (try) {
1114 try = 0;
1115 for (i = start; i < ec->num_trans; i++) {
1116 rb_econv_elem_t *te = &ec->elems[i];
1117
1118 if (i == 0) {
1119 ipp = input_ptr;
1120 is = input_stop;
1121 }
1122 else {
1123 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1124 ipp = (const unsigned char **)&prev_te->out_data_start;
1125 is = prev_te->out_data_end;
1126 }
1127
1128 if (i == ec->num_trans-1) {
1129 opp = output_ptr;
1130 os = output_stop;
1131 }
1132 else {
1133 if (te->out_buf_start != te->out_data_start) {
1134 ssize_t len = te->out_data_end - te->out_data_start;
1135 ssize_t off = te->out_data_start - te->out_buf_start;
1136 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1137 te->out_data_start = te->out_buf_start;
1138 te->out_data_end -= off;
1139 }
1140 opp = &te->out_data_end;
1141 os = te->out_buf_end;
1142 }
1143
1144 f = flags;
1145 if (ec->num_finished != i)
1147 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1148 start = 1;
1149 flags &= ~ECONV_AFTER_OUTPUT;
1150 }
1151 if (i != 0)
1152 f &= ~ECONV_AFTER_OUTPUT;
1153 iold = *ipp;
1154 oold = *opp;
1155 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1156 if (iold != *ipp || oold != *opp)
1157 try = 1;
1158
1159 switch (res) {
1163 case econv_after_output:
1164 return i;
1165
1168 break;
1169
1170 case econv_finished:
1171 ec->num_finished = i+1;
1172 break;
1173 }
1174 }
1175 }
1176 return -1;
1177}
1178
1179static rb_econv_result_t
1180rb_trans_conv(rb_econv_t *ec,
1181 const unsigned char **input_ptr, const unsigned char *input_stop,
1182 unsigned char **output_ptr, unsigned char *output_stop,
1183 int flags,
1184 int *result_position_ptr)
1185{
1186 int i;
1187 int needreport_index;
1188 int sweep_start;
1189
1190 unsigned char empty_buf;
1191 unsigned char *empty_ptr = &empty_buf;
1192
1193 if (!input_ptr) {
1194 input_ptr = (const unsigned char **)&empty_ptr;
1195 input_stop = empty_ptr;
1196 }
1197
1198 if (!output_ptr) {
1199 output_ptr = &empty_ptr;
1200 output_stop = empty_ptr;
1201 }
1202
1203 if (ec->elems[0].last_result == econv_after_output)
1204 ec->elems[0].last_result = econv_source_buffer_empty;
1205
1206 for (i = ec->num_trans-1; 0 <= i; i--) {
1207 switch (ec->elems[i].last_result) {
1211 case econv_after_output:
1212 case econv_finished:
1213 sweep_start = i+1;
1214 goto found_needreport;
1215
1218 break;
1219
1220 default:
1221 rb_bug("unexpected transcode last result");
1222 }
1223 }
1224
1225 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1226
1227 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1228 (flags & ECONV_AFTER_OUTPUT)) {
1229 rb_econv_result_t res;
1230
1231 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1233 result_position_ptr);
1234
1235 if (res == econv_source_buffer_empty)
1236 return econv_after_output;
1237 return res;
1238 }
1239
1240 sweep_start = 0;
1241
1242 found_needreport:
1243
1244 do {
1245 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1246 sweep_start = needreport_index + 1;
1247 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1248
1249 for (i = ec->num_trans-1; 0 <= i; i--) {
1250 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1251 rb_econv_result_t res = ec->elems[i].last_result;
1252 if (res == econv_invalid_byte_sequence ||
1253 res == econv_incomplete_input ||
1255 res == econv_after_output) {
1256 ec->elems[i].last_result = econv_source_buffer_empty;
1257 }
1258 if (result_position_ptr)
1259 *result_position_ptr = i;
1260 return res;
1261 }
1262 }
1263 if (result_position_ptr)
1264 *result_position_ptr = -1;
1266}
1267
1268static rb_econv_result_t
1269rb_econv_convert0(rb_econv_t *ec,
1270 const unsigned char **input_ptr, const unsigned char *input_stop,
1271 unsigned char **output_ptr, unsigned char *output_stop,
1272 int flags)
1273{
1274 rb_econv_result_t res;
1275 int result_position;
1276 int has_output = 0;
1277
1278 memset(&ec->last_error, 0, sizeof(ec->last_error));
1279
1280 if (ec->num_trans == 0) {
1281 size_t len;
1282 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1283 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1284 len = output_stop - *output_ptr;
1285 memcpy(*output_ptr, ec->in_data_start, len);
1286 *output_ptr = output_stop;
1287 ec->in_data_start += len;
1289 goto gotresult;
1290 }
1291 len = ec->in_data_end - ec->in_data_start;
1292 memcpy(*output_ptr, ec->in_data_start, len);
1293 *output_ptr += len;
1294 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1295 if (flags & ECONV_AFTER_OUTPUT) {
1296 res = econv_after_output;
1297 goto gotresult;
1298 }
1299 }
1300 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1301 len = output_stop - *output_ptr;
1302 }
1303 else {
1304 len = input_stop - *input_ptr;
1305 }
1306 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1307 *(*output_ptr)++ = *(*input_ptr)++;
1308 res = econv_after_output;
1309 goto gotresult;
1310 }
1311 memcpy(*output_ptr, *input_ptr, len);
1312 *output_ptr += len;
1313 *input_ptr += len;
1314 if (*input_ptr != input_stop)
1316 else if (flags & ECONV_PARTIAL_INPUT)
1318 else
1319 res = econv_finished;
1320 goto gotresult;
1321 }
1322
1323 if (ec->elems[ec->num_trans-1].out_data_start) {
1324 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1325 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1326 if (data_start != data_end) {
1327 size_t len;
1328 if (output_stop - *output_ptr < data_end - data_start) {
1329 len = output_stop - *output_ptr;
1330 memcpy(*output_ptr, data_start, len);
1331 *output_ptr = output_stop;
1332 ec->elems[ec->num_trans-1].out_data_start += len;
1334 goto gotresult;
1335 }
1336 len = data_end - data_start;
1337 memcpy(*output_ptr, data_start, len);
1338 *output_ptr += len;
1339 ec->elems[ec->num_trans-1].out_data_start =
1340 ec->elems[ec->num_trans-1].out_data_end =
1341 ec->elems[ec->num_trans-1].out_buf_start;
1342 has_output = 1;
1343 }
1344 }
1345
1346 if (ec->in_buf_start &&
1347 ec->in_data_start != ec->in_data_end) {
1348 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1349 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1350 if (res != econv_source_buffer_empty)
1351 goto gotresult;
1352 }
1353
1354 if (has_output &&
1355 (flags & ECONV_AFTER_OUTPUT) &&
1356 *input_ptr != input_stop) {
1357 input_stop = *input_ptr;
1358 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1359 if (res == econv_source_buffer_empty)
1360 res = econv_after_output;
1361 }
1362 else if ((flags & ECONV_AFTER_OUTPUT) ||
1363 ec->num_trans == 1) {
1364 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1365 }
1366 else {
1367 flags |= ECONV_AFTER_OUTPUT;
1368 do {
1369 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1370 } while (res == econv_after_output);
1371 }
1372
1373 gotresult:
1374 ec->last_error.result = res;
1375 if (res == econv_invalid_byte_sequence ||
1376 res == econv_incomplete_input ||
1378 rb_transcoding *error_tc = ec->elems[result_position].tc;
1379 ec->last_error.error_tc = error_tc;
1380 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1381 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1382 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1383 ec->last_error.error_bytes_len = error_tc->recognized_len;
1384 ec->last_error.readagain_len = error_tc->readagain_len;
1385 }
1386
1387 return res;
1388}
1389
1390static int output_replacement_character(rb_econv_t *ec);
1391
1392static int
1393output_hex_charref(rb_econv_t *ec)
1394{
1395 int ret;
1396 unsigned char utfbuf[1024];
1397 const unsigned char *utf;
1398 size_t utf_len;
1399 int utf_allocated = 0;
1400 char charef_buf[16];
1401 const unsigned char *p;
1402
1403 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1404 utf = ec->last_error.error_bytes_start;
1405 utf_len = ec->last_error.error_bytes_len;
1406 }
1407 else {
1408 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1409 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1410 utfbuf, sizeof(utfbuf),
1411 &utf_len);
1412 if (!utf)
1413 return -1;
1414 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1415 utf_allocated = 1;
1416 }
1417
1418 if (utf_len % 4 != 0)
1419 goto fail;
1420
1421 p = utf;
1422 while (4 <= utf_len) {
1423 unsigned int u = 0;
1424 u += p[0] << 24;
1425 u += p[1] << 16;
1426 u += p[2] << 8;
1427 u += p[3];
1428 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1429
1430 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1431 if (ret == -1)
1432 goto fail;
1433
1434 p += 4;
1435 utf_len -= 4;
1436 }
1437
1438 if (utf_allocated)
1439 xfree((void *)utf);
1440 return 0;
1441
1442 fail:
1443 if (utf_allocated)
1444 xfree((void *)utf);
1445 return -1;
1446}
1447
1448rb_econv_result_t
1450 const unsigned char **input_ptr, const unsigned char *input_stop,
1451 unsigned char **output_ptr, unsigned char *output_stop,
1452 int flags)
1453{
1454 rb_econv_result_t ret;
1455
1456 unsigned char empty_buf;
1457 unsigned char *empty_ptr = &empty_buf;
1458
1459 ec->started = 1;
1460
1461 if (!input_ptr) {
1462 input_ptr = (const unsigned char **)&empty_ptr;
1463 input_stop = empty_ptr;
1464 }
1465
1466 if (!output_ptr) {
1467 output_ptr = &empty_ptr;
1468 output_stop = empty_ptr;
1469 }
1470
1471 resume:
1472 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1473
1474 if (ret == econv_invalid_byte_sequence ||
1475 ret == econv_incomplete_input) {
1476 /* deal with invalid byte sequence */
1477 /* todo: add more alternative behaviors */
1478 switch (ec->flags & ECONV_INVALID_MASK) {
1480 if (output_replacement_character(ec) == 0)
1481 goto resume;
1482 }
1483 }
1484
1485 if (ret == econv_undefined_conversion) {
1486 /* valid character in source encoding
1487 * but no related character(s) in destination encoding */
1488 /* todo: add more alternative behaviors */
1489 switch (ec->flags & ECONV_UNDEF_MASK) {
1491 if (output_replacement_character(ec) == 0)
1492 goto resume;
1493 break;
1494
1496 if (output_hex_charref(ec) == 0)
1497 goto resume;
1498 break;
1499 }
1500 }
1501
1502 return ret;
1503}
1504
1505const char *
1507{
1508 rb_transcoding *tc = ec->last_tc;
1509 const rb_transcoder *tr;
1510
1511 if (tc == NULL)
1512 return "";
1513
1514 tr = tc->transcoder;
1515
1516 if (tr->asciicompat_type == asciicompat_encoder)
1517 return tr->src_encoding;
1518 return tr->dst_encoding;
1519}
1520
1521static unsigned char *
1522allocate_converted_string(const char *sname, const char *dname,
1523 const unsigned char *str, size_t len,
1524 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1525 size_t *dst_len_ptr)
1526{
1527 unsigned char *dst_str;
1528 size_t dst_len;
1529 size_t dst_bufsize;
1530
1531 rb_econv_t *ec;
1532 rb_econv_result_t res;
1533
1534 const unsigned char *sp;
1535 unsigned char *dp;
1536
1537 if (caller_dst_buf)
1538 dst_bufsize = caller_dst_bufsize;
1539 else if (len == 0)
1540 dst_bufsize = 1;
1541 else
1542 dst_bufsize = len;
1543
1544 ec = rb_econv_open(sname, dname, 0);
1545 if (ec == NULL)
1546 return NULL;
1547 if (caller_dst_buf)
1548 dst_str = caller_dst_buf;
1549 else
1550 dst_str = xmalloc(dst_bufsize);
1551 dst_len = 0;
1552 sp = str;
1553 dp = dst_str+dst_len;
1554 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1555 dst_len = dp - dst_str;
1556 while (res == econv_destination_buffer_full) {
1557 if (SIZE_MAX/2 < dst_bufsize) {
1558 goto fail;
1559 }
1560 dst_bufsize *= 2;
1561 if (dst_str == caller_dst_buf) {
1562 unsigned char *tmp;
1563 tmp = xmalloc(dst_bufsize);
1564 memcpy(tmp, dst_str, dst_bufsize/2);
1565 dst_str = tmp;
1566 }
1567 else {
1568 dst_str = xrealloc(dst_str, dst_bufsize);
1569 }
1570 dp = dst_str+dst_len;
1571 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1572 dst_len = dp - dst_str;
1573 }
1574 if (res != econv_finished) {
1575 goto fail;
1576 }
1577 rb_econv_close(ec);
1578 *dst_len_ptr = dst_len;
1579 return dst_str;
1580
1581 fail:
1582 if (dst_str != caller_dst_buf)
1583 xfree(dst_str);
1584 rb_econv_close(ec);
1585 return NULL;
1586}
1587
1588/* result: 0:success -1:failure */
1589int
1591 const unsigned char *str, size_t len, const char *str_encoding)
1592{
1593 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1594 unsigned char insert_buf[4096];
1595 const unsigned char *insert_str = NULL;
1596 size_t insert_len;
1597
1598 int last_trans_index;
1599 rb_transcoding *tc;
1600
1601 unsigned char **buf_start_p;
1602 unsigned char **data_start_p;
1603 unsigned char **data_end_p;
1604 unsigned char **buf_end_p;
1605
1606 size_t need;
1607
1608 ec->started = 1;
1609
1610 if (len == 0)
1611 return 0;
1612
1613 if (encoding_equal(insert_encoding, str_encoding)) {
1614 insert_str = str;
1615 insert_len = len;
1616 }
1617 else {
1618 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1619 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1620 if (insert_str == NULL)
1621 return -1;
1622 }
1623
1624 need = insert_len;
1625
1626 last_trans_index = ec->num_trans-1;
1627 if (ec->num_trans == 0) {
1628 tc = NULL;
1629 buf_start_p = &ec->in_buf_start;
1630 data_start_p = &ec->in_data_start;
1631 data_end_p = &ec->in_data_end;
1632 buf_end_p = &ec->in_buf_end;
1633 }
1634 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1635 tc = ec->elems[last_trans_index].tc;
1636 need += tc->readagain_len;
1637 if (need < insert_len)
1638 goto fail;
1639 if (last_trans_index == 0) {
1640 buf_start_p = &ec->in_buf_start;
1641 data_start_p = &ec->in_data_start;
1642 data_end_p = &ec->in_data_end;
1643 buf_end_p = &ec->in_buf_end;
1644 }
1645 else {
1646 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1647 buf_start_p = &ee->out_buf_start;
1648 data_start_p = &ee->out_data_start;
1649 data_end_p = &ee->out_data_end;
1650 buf_end_p = &ee->out_buf_end;
1651 }
1652 }
1653 else {
1654 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1655 buf_start_p = &ee->out_buf_start;
1656 data_start_p = &ee->out_data_start;
1657 data_end_p = &ee->out_data_end;
1658 buf_end_p = &ee->out_buf_end;
1659 tc = ec->elems[last_trans_index].tc;
1660 }
1661
1662 if (*buf_start_p == NULL) {
1663 unsigned char *buf = xmalloc(need);
1664 *buf_start_p = buf;
1665 *data_start_p = buf;
1666 *data_end_p = buf;
1667 *buf_end_p = buf+need;
1668 }
1669 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1670 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1671 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1672 *data_start_p = *buf_start_p;
1673 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1674 unsigned char *buf;
1675 size_t s = (*data_end_p - *buf_start_p) + need;
1676 if (s < need)
1677 goto fail;
1678 buf = xrealloc(*buf_start_p, s);
1679 *data_start_p = buf;
1680 *data_end_p = buf + (*data_end_p - *buf_start_p);
1681 *buf_start_p = buf;
1682 *buf_end_p = buf + s;
1683 }
1684 }
1685
1686 memcpy(*data_end_p, insert_str, insert_len);
1687 *data_end_p += insert_len;
1688 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1689 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1690 *data_end_p += tc->readagain_len;
1691 tc->readagain_len = 0;
1692 }
1693
1694 if (insert_str != str && insert_str != insert_buf)
1695 xfree((void*)insert_str);
1696 return 0;
1697
1698 fail:
1699 if (insert_str != str && insert_str != insert_buf)
1700 xfree((void*)insert_str);
1701 return -1;
1702}
1703
1704void
1706{
1707 int i;
1708
1709 if (ec->replacement_allocated) {
1710 xfree((void *)ec->replacement_str);
1711 }
1712 for (i = 0; i < ec->num_trans; i++) {
1713 rb_transcoding_close(ec->elems[i].tc);
1714 if (ec->elems[i].out_buf_start)
1715 xfree(ec->elems[i].out_buf_start);
1716 }
1717 xfree(ec->in_buf_start);
1718 xfree(ec->elems);
1719 xfree(ec);
1720}
1721
1722size_t
1723rb_econv_memsize(rb_econv_t *ec)
1724{
1725 size_t size = sizeof(rb_econv_t);
1726 int i;
1727
1728 if (ec->replacement_allocated) {
1729 size += ec->replacement_len;
1730 }
1731 for (i = 0; i < ec->num_trans; i++) {
1732 size += rb_transcoding_memsize(ec->elems[i].tc);
1733
1734 if (ec->elems[i].out_buf_start) {
1735 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1736 }
1737 }
1738 size += ec->in_buf_end - ec->in_buf_start;
1739 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1740
1741 return size;
1742}
1743
1744int
1746{
1747 if (ec->num_trans == 0)
1748 return 0;
1749#if SIZEOF_SIZE_T > SIZEOF_INT
1750 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1751#endif
1752 return (int)ec->elems[0].tc->readagain_len;
1753}
1754
1755void
1756rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1757{
1758 rb_transcoding *tc;
1759 if (ec->num_trans == 0 || n == 0)
1760 return;
1761 tc = ec->elems[0].tc;
1762 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1763 tc->readagain_len -= n;
1764}
1765
/* Argument record passed through st_foreach() to asciicompat_encoding_i(). */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;      /* result; NULL until found */
    const char *ascii_incompat_name;    /* encoding being looked up */
};
1770
1771static int
1772asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1773{
1774 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1775 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1776 const rb_transcoder *tr;
1777
1778 if (DECORATOR_P(entry->sname, entry->dname))
1779 return ST_CONTINUE;
1780 tr = load_transcoder_entry(entry);
1781 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1782 data->ascii_compat_name = tr->dst_encoding;
1783 return ST_STOP;
1784 }
1785 return ST_CONTINUE;
1786}
1787
1788const char *
1789rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1790{
1791 st_data_t v;
1792 st_table *table2;
1793 struct asciicompat_encoding_t data;
1794
1795 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1796 return NULL;
1797 table2 = (st_table *)v;
1798
1799 /*
1800 * Assumption:
1801 * There is at most one transcoder for
1802 * converting from ASCII incompatible encoding.
1803 *
1804 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1805 */
1806 if (table2->num_entries != 1)
1807 return NULL;
1808
1809 data.ascii_incompat_name = ascii_incompat_name;
1810 data.ascii_compat_name = NULL;
1811 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1812 return data.ascii_compat_name;
1813}
1814
1815/*
1816 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1817 *
1818 * If the result of the conversion is not compatible with the encoding of
1819 * `dst`, `dst` may not be valid encoding.
1820 */
1821VALUE
1822rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1823{
1824 unsigned const char *sp, *se;
1825 unsigned char *ds, *dp, *de;
1826 rb_econv_result_t res;
1827 int max_output;
1828 enum ruby_coderange_type coderange;
1829 rb_encoding *dst_enc = ec->destination_encoding;
1830
1831 if (NIL_P(dst)) {
1832 dst = rb_str_buf_new(len);
1833 if (dst_enc) {
1834 rb_enc_associate(dst, dst_enc);
1835 }
1836 coderange = ENC_CODERANGE_7BIT; // scan from the start
1837 }
1838 else {
1839 dst_enc = rb_enc_get(dst);
1840 coderange = rb_enc_str_coderange(dst);
1841 }
1842
1843 if (ec->last_tc)
1844 max_output = ec->last_tc->transcoder->max_output;
1845 else
1846 max_output = 1;
1847
1848 do {
1849 int cr;
1850 long dlen = RSTRING_LEN(dst);
1851 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1852 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1853 if (LONG_MAX < new_capa)
1854 rb_raise(rb_eArgError, "too long string");
1855 rb_str_modify_expand(dst, new_capa - dlen);
1856 }
1857 sp = (const unsigned char *)ss;
1858 se = sp + len;
1859 ds = (unsigned char *)RSTRING_PTR(dst);
1860 de = ds + rb_str_capacity(dst);
1861 dp = ds += dlen;
1862 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1863 switch (coderange) {
1864 case ENC_CODERANGE_7BIT:
1866 cr = (int)coderange;
1867 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1868 coderange = cr;
1869 ENC_CODERANGE_SET(dst, coderange);
1870 break;
1873 break;
1874 }
1875 len -= (const char *)sp - ss;
1876 ss = (const char *)sp;
1877 rb_str_set_len(dst, dlen + (dp - ds));
1879 } while (res == econv_destination_buffer_full);
1880
1881 return dst;
1882}
1883
1884VALUE
1885rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1886{
1887 src = rb_str_new_frozen(src);
1888 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1889 RB_GC_GUARD(src);
1890 return dst;
1891}
1892
1893VALUE
1894rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1895{
1896 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1897}
1898
1899VALUE
1900rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1901{
1902 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1903}
1904
1905VALUE
1906rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
1907{
1908 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1909}
1910
1911static int
1912rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1913{
1914 transcoder_entry_t *entry;
1915 const rb_transcoder *tr;
1916
1917 if (ec->started != 0)
1918 return -1;
1919
1920 entry = get_transcoder_entry(sname, dname);
1921 if (!entry)
1922 return -1;
1923
1924 tr = load_transcoder_entry(entry);
1925 if (!tr) return -1;
1926
1927 return rb_econv_add_transcoder_at(ec, tr, n);
1928}
1929
1930static int
1931rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1932{
1933 return rb_econv_add_converter(ec, "", decorator_name, n);
1934}
1935
1936int
1937rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1938{
1939 const rb_transcoder *tr;
1940
1941 if (ec->num_trans == 0)
1942 return rb_econv_decorate_at(ec, decorator_name, 0);
1943
1944 tr = ec->elems[0].tc->transcoder;
1945
1946 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1947 tr->asciicompat_type == asciicompat_decoder)
1948 return rb_econv_decorate_at(ec, decorator_name, 1);
1949
1950 return rb_econv_decorate_at(ec, decorator_name, 0);
1951}
1952
1953int
1954rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1955{
1956 const rb_transcoder *tr;
1957
1958 if (ec->num_trans == 0)
1959 return rb_econv_decorate_at(ec, decorator_name, 0);
1960
1961 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1962
1963 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1964 tr->asciicompat_type == asciicompat_encoder)
1965 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1966
1967 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1968}
1969
1970void
1972{
1973 const char *dname = 0;
1974
1975 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1977 dname = "universal_newline";
1978 break;
1980 dname = "crlf_newline";
1981 break;
1983 dname = "cr_newline";
1984 break;
1985 }
1986
1987 if (dname) {
1988 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1989 int num_trans = ec->num_trans;
1990 int i, j = 0;
1991
1992 for (i=0; i < num_trans; i++) {
1993 if (transcoder == ec->elems[i].tc->transcoder) {
1994 rb_transcoding_close(ec->elems[i].tc);
1995 xfree(ec->elems[i].out_buf_start);
1996 ec->num_trans--;
1997 }
1998 else
1999 ec->elems[j++] = ec->elems[i];
2000 }
2001 }
2002
2003 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2004}
2005
2006static VALUE
2007econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2008{
2009 int has_description = 0;
2010
2011 if (NIL_P(mesg))
2012 mesg = rb_str_new(NULL, 0);
2013
2014 if (*sname != '\0' || *dname != '\0') {
2015 if (*sname == '\0')
2016 rb_str_cat2(mesg, dname);
2017 else if (*dname == '\0')
2018 rb_str_cat2(mesg, sname);
2019 else
2020 rb_str_catf(mesg, "%s to %s", sname, dname);
2021 has_description = 1;
2022 }
2023
2024 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2028 const char *pre = "";
2029 if (has_description)
2030 rb_str_cat2(mesg, " with ");
2031 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2032 rb_str_cat2(mesg, pre); pre = ",";
2033 rb_str_cat2(mesg, "universal_newline");
2034 }
2035 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2036 rb_str_cat2(mesg, pre); pre = ",";
2037 rb_str_cat2(mesg, "crlf_newline");
2038 }
2039 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2040 rb_str_cat2(mesg, pre); pre = ",";
2041 rb_str_cat2(mesg, "cr_newline");
2042 }
2043 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2044 rb_str_cat2(mesg, pre); pre = ",";
2045 rb_str_cat2(mesg, "xml_text");
2046 }
2047 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2048 rb_str_cat2(mesg, pre); pre = ",";
2049 rb_str_cat2(mesg, "xml_attr_content");
2050 }
2051 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2052 rb_str_cat2(mesg, pre); pre = ",";
2053 rb_str_cat2(mesg, "xml_attr_quote");
2054 }
2055 has_description = 1;
2056 }
2057 if (!has_description) {
2058 rb_str_cat2(mesg, "no-conversion");
2059 }
2060
2061 return mesg;
2062}
2063
2064VALUE
2065rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2066{
2067 VALUE mesg, exc;
2068 mesg = rb_str_new_cstr("code converter not found (");
2069 econv_description(sname, dname, ecflags, mesg);
2070 rb_str_cat2(mesg, ")");
2071 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2072 return exc;
2073}
2074
2075static VALUE
2076make_econv_exception(rb_econv_t *ec)
2077{
2078 VALUE mesg, exc;
2079 if (ec->last_error.result == econv_invalid_byte_sequence ||
2080 ec->last_error.result == econv_incomplete_input) {
2081 const char *err = (const char *)ec->last_error.error_bytes_start;
2082 size_t error_len = ec->last_error.error_bytes_len;
2083 VALUE bytes = rb_str_new(err, error_len);
2084 VALUE dumped = rb_str_dump(bytes);
2085 size_t readagain_len = ec->last_error.readagain_len;
2086 VALUE bytes2 = Qnil;
2087 VALUE dumped2;
2088 if (ec->last_error.result == econv_incomplete_input) {
2089 mesg = rb_sprintf("incomplete %s on %s",
2090 StringValueCStr(dumped),
2091 ec->last_error.source_encoding);
2092 }
2093 else if (readagain_len) {
2094 bytes2 = rb_str_new(err+error_len, readagain_len);
2095 dumped2 = rb_str_dump(bytes2);
2096 mesg = rb_sprintf("%s followed by %s on %s",
2097 StringValueCStr(dumped),
2098 StringValueCStr(dumped2),
2099 ec->last_error.source_encoding);
2100 }
2101 else {
2102 mesg = rb_sprintf("%s on %s",
2103 StringValueCStr(dumped),
2104 ec->last_error.source_encoding);
2105 }
2106
2107 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2108 rb_ivar_set(exc, id_error_bytes, bytes);
2109 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2110 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2111 goto set_encs;
2112 }
2113 if (ec->last_error.result == econv_undefined_conversion) {
2114 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2115 ec->last_error.error_bytes_len);
2116 VALUE dumped = Qnil;
2117 int idx;
2118 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2119 rb_encoding *utf8 = rb_utf8_encoding();
2120 const char *start, *end;
2121 int n;
2122 start = (const char *)ec->last_error.error_bytes_start;
2123 end = start + ec->last_error.error_bytes_len;
2124 n = rb_enc_precise_mbclen(start, end, utf8);
2125 if (MBCLEN_CHARFOUND_P(n) &&
2126 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2127 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2128 dumped = rb_sprintf("U+%04X", cc);
2129 }
2130 }
2131 if (NIL_P(dumped))
2132 dumped = rb_str_dump(bytes);
2133 if (strcmp(ec->last_error.source_encoding,
2134 ec->source_encoding_name) == 0 &&
2135 strcmp(ec->last_error.destination_encoding,
2136 ec->destination_encoding_name) == 0) {
2137 mesg = rb_sprintf("%s from %s to %s",
2138 StringValueCStr(dumped),
2139 ec->last_error.source_encoding,
2140 ec->last_error.destination_encoding);
2141 }
2142 else {
2143 int i;
2144 mesg = rb_sprintf("%s to %s in conversion from %s",
2145 StringValueCStr(dumped),
2146 ec->last_error.destination_encoding,
2147 ec->source_encoding_name);
2148 for (i = 0; i < ec->num_trans; i++) {
2149 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2150 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2151 rb_str_catf(mesg, " to %s",
2152 ec->elems[i].tc->transcoder->dst_encoding);
2153 }
2154 }
2155 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2156 idx = rb_enc_find_index(ec->last_error.source_encoding);
2157 if (0 <= idx)
2158 rb_enc_associate_index(bytes, idx);
2159 rb_ivar_set(exc, id_error_char, bytes);
2160 goto set_encs;
2161 }
2162 return Qnil;
2163
2164 set_encs:
2165 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2166 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2167 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2168 if (0 <= idx)
2169 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2170 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2171 if (0 <= idx)
2172 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2173 return exc;
2174}
2175
2176static void
2177more_output_buffer(
2178 VALUE destination,
2179 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2180 int max_output,
2181 unsigned char **out_start_ptr,
2182 unsigned char **out_pos,
2183 unsigned char **out_stop_ptr)
2184{
2185 size_t len = (*out_pos - *out_start_ptr);
2186 size_t new_len = (len + max_output) * 2;
2187 *out_start_ptr = resize_destination(destination, len, new_len);
2188 *out_pos = *out_start_ptr + len;
2189 *out_stop_ptr = *out_start_ptr + new_len;
2190}
2191
2192static int
2193make_replacement(rb_econv_t *ec)
2194{
2195 rb_transcoding *tc;
2196 const rb_transcoder *tr;
2197 const unsigned char *replacement;
2198 const char *repl_enc;
2199 const char *ins_enc;
2200 size_t len;
2201
2202 if (ec->replacement_str)
2203 return 0;
2204
2206
2207 tc = ec->last_tc;
2208 if (*ins_enc) {
2209 tr = tc->transcoder;
2210 rb_enc_find(tr->dst_encoding);
2211 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2212 }
2213 else {
2214 replacement = (unsigned char *)"?";
2215 len = 1;
2216 repl_enc = "";
2217 }
2218
2219 ec->replacement_str = replacement;
2220 ec->replacement_len = len;
2221 ec->replacement_enc = repl_enc;
2222 ec->replacement_allocated = 0;
2223 return 0;
2224}
2225
2226int
2228 const unsigned char *str, size_t len, const char *encname)
2229{
2230 unsigned char *str2;
2231 size_t len2;
2232 const char *encname2;
2233
2235
2236 if (!*encname2 || encoding_equal(encname, encname2)) {
2237 str2 = xmalloc(len);
2238 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2239 len2 = len;
2240 encname2 = encname;
2241 }
2242 else {
2243 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2244 if (!str2)
2245 return -1;
2246 }
2247
2248 if (ec->replacement_allocated) {
2249 xfree((void *)ec->replacement_str);
2250 }
2251 ec->replacement_allocated = 1;
2252 ec->replacement_str = str2;
2253 ec->replacement_len = len2;
2254 ec->replacement_enc = encname2;
2255 return 0;
2256}
2257
2258static int
2259output_replacement_character(rb_econv_t *ec)
2260{
2261 int ret;
2262
2263 if (make_replacement(ec) == -1)
2264 return -1;
2265
2266 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2267 if (ret == -1)
2268 return -1;
2269
2270 return 0;
2271}
2272
2273#if 1
2274#define hash_fallback rb_hash_aref
2275
2276static VALUE
2277proc_fallback(VALUE fallback, VALUE c)
2278{
2279 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2280}
2281
2282static VALUE
2283method_fallback(VALUE fallback, VALUE c)
2284{
2285 return rb_method_call(1, &c, fallback);
2286}
2287
2288static VALUE
2289aref_fallback(VALUE fallback, VALUE c)
2290{
2291 return rb_funcallv_public(fallback, idAREF, 1, &c);
2292}
2293
2294static void
2295transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2296 const unsigned char *in_stop, unsigned char *out_stop,
2297 VALUE destination,
2298 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2299 const char *src_encoding,
2300 const char *dst_encoding,
2301 int ecflags,
2302 VALUE ecopts)
2303{
2304 rb_econv_t *ec;
2305 rb_transcoding *last_tc;
2306 rb_econv_result_t ret;
2307 unsigned char *out_start = *out_pos;
2308 int max_output;
2309 VALUE exc;
2310 VALUE fallback = Qnil;
2311 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2312
2313 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2314 if (!ec)
2315 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2316
2317 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2318 fallback = rb_hash_aref(ecopts, sym_fallback);
2319 if (RB_TYPE_P(fallback, T_HASH)) {
2320 fallback_func = hash_fallback;
2321 }
2322 else if (rb_obj_is_proc(fallback)) {
2323 fallback_func = proc_fallback;
2324 }
2325 else if (rb_obj_is_method(fallback)) {
2326 fallback_func = method_fallback;
2327 }
2328 else {
2329 fallback_func = aref_fallback;
2330 }
2331 }
2332 last_tc = ec->last_tc;
2333 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2334
2335 resume:
2336 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2337
2338 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2339 VALUE rep = rb_enc_str_new(
2340 (const char *)ec->last_error.error_bytes_start,
2341 ec->last_error.error_bytes_len,
2342 rb_enc_find(ec->last_error.source_encoding));
2343 rep = (*fallback_func)(fallback, rep);
2344 if (rep != Qundef && !NIL_P(rep)) {
2345 StringValue(rep);
2346 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2347 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2348 if ((int)ret == -1) {
2349 rb_raise(rb_eArgError, "too big fallback string");
2350 }
2351 goto resume;
2352 }
2353 }
2354
2355 if (ret == econv_invalid_byte_sequence ||
2356 ret == econv_incomplete_input ||
2358 exc = make_econv_exception(ec);
2359 rb_econv_close(ec);
2360 rb_exc_raise(exc);
2361 }
2362
2363 if (ret == econv_destination_buffer_full) {
2364 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2365 goto resume;
2366 }
2367
2368 rb_econv_close(ec);
2369 return;
2370}
2371#else
2372/* sample transcode_loop implementation in byte-by-byte stream style */
2373static void
2374transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2375 const unsigned char *in_stop, unsigned char *out_stop,
2376 VALUE destination,
2377 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2378 const char *src_encoding,
2379 const char *dst_encoding,
2380 int ecflags,
2381 VALUE ecopts)
2382{
2383 rb_econv_t *ec;
2384 rb_transcoding *last_tc;
2385 rb_econv_result_t ret;
2386 unsigned char *out_start = *out_pos;
2387 const unsigned char *ptr;
2388 int max_output;
2389 VALUE exc;
2390
2391 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2392 if (!ec)
2393 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2394
2395 last_tc = ec->last_tc;
2396 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2397
2399 ptr = *in_pos;
2400 while (ret != econv_finished) {
2401 unsigned char input_byte;
2402 const unsigned char *p = &input_byte;
2403
2404 if (ret == econv_source_buffer_empty) {
2405 if (ptr < in_stop) {
2406 input_byte = *ptr;
2407 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2408 }
2409 else {
2410 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2411 }
2412 }
2413 else {
2414 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2415 }
2416 if (&input_byte != p)
2417 ptr += p - &input_byte;
2418 switch (ret) {
2422 exc = make_econv_exception(ec);
2423 rb_econv_close(ec);
2424 rb_exc_raise(exc);
2425 break;
2426
2428 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2429 break;
2430
2432 break;
2433
2434 case econv_finished:
2435 break;
2436 }
2437 }
2438 rb_econv_close(ec);
2439 *in_pos = in_stop;
2440 return;
2441}
2442#endif
2443
2444
2445/*
2446 * String-specific code
2447 */
2448
2449static unsigned char *
2450str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2451{
2452 rb_str_resize(destination, new_len);
2453 return (unsigned char *)RSTRING_PTR(destination);
2454}
2455
2456static int
2457econv_opts(VALUE opt, int ecflags)
2458{
2459 VALUE v;
2460 int newlineflag = 0;
2461
2462 v = rb_hash_aref(opt, sym_invalid);
2463 if (NIL_P(v)) {
2464 }
2465 else if (v==sym_replace) {
2466 ecflags |= ECONV_INVALID_REPLACE;
2467 }
2468 else {
2469 rb_raise(rb_eArgError, "unknown value for invalid character option");
2470 }
2471
2472 v = rb_hash_aref(opt, sym_undef);
2473 if (NIL_P(v)) {
2474 }
2475 else if (v==sym_replace) {
2476 ecflags |= ECONV_UNDEF_REPLACE;
2477 }
2478 else {
2479 rb_raise(rb_eArgError, "unknown value for undefined character option");
2480 }
2481
2482 v = rb_hash_aref(opt, sym_replace);
2483 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2484 ecflags |= ECONV_UNDEF_REPLACE;
2485 }
2486
2487 v = rb_hash_aref(opt, sym_xml);
2488 if (!NIL_P(v)) {
2489 if (v==sym_text) {
2491 }
2492 else if (v==sym_attr) {
2494 }
2495 else if (SYMBOL_P(v)) {
2496 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2497 }
2498 else {
2499 rb_raise(rb_eArgError, "unexpected value for xml option");
2500 }
2501 }
2502
2503#ifdef ENABLE_ECONV_NEWLINE_OPTION
2504 v = rb_hash_aref(opt, sym_newline);
2505 if (!NIL_P(v)) {
2506 newlineflag = 2;
2507 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2508 if (v == sym_universal) {
2510 }
2511 else if (v == sym_crlf) {
2513 }
2514 else if (v == sym_cr) {
2515 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2516 }
2517 else if (v == sym_lf) {
2518 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2519 }
2520 else if (SYMBOL_P(v)) {
2521 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2522 rb_sym2str(v));
2523 }
2524 else {
2525 rb_raise(rb_eArgError, "unexpected value for newline option");
2526 }
2527 }
2528#endif
2529 {
2530 int setflags = 0;
2531
2532 v = rb_hash_aref(opt, sym_universal_newline);
2533 if (RTEST(v))
2535 newlineflag |= !NIL_P(v);
2536
2537 v = rb_hash_aref(opt, sym_crlf_newline);
2538 if (RTEST(v))
2539 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2540 newlineflag |= !NIL_P(v);
2541
2542 v = rb_hash_aref(opt, sym_cr_newline);
2543 if (RTEST(v))
2544 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2545 newlineflag |= !NIL_P(v);
2546
2547 switch (newlineflag) {
2548 case 1:
2549 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2550 ecflags |= setflags;
2551 break;
2552
2553 case 3:
2554 rb_warning(":newline option precedes other newline options");
2555 break;
2556 }
2557 }
2558
2559 return ecflags;
2560}
2561
2562int
2563rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2564{
2565 VALUE newhash = Qnil;
2566 VALUE v;
2567
2568 if (NIL_P(opthash)) {
2569 *opts = Qnil;
2570 return ecflags;
2571 }
2572 ecflags = econv_opts(opthash, ecflags);
2573
2574 v = rb_hash_aref(opthash, sym_replace);
2575 if (!NIL_P(v)) {
2576 StringValue(v);
2578 VALUE dumped = rb_str_dump(v);
2579 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2580 StringValueCStr(dumped),
2582 }
2583 v = rb_str_new_frozen(v);
2584 newhash = rb_hash_new();
2585 rb_hash_aset(newhash, sym_replace, v);
2586 }
2587
2588 v = rb_hash_aref(opthash, sym_fallback);
2589 if (!NIL_P(v)) {
2590 VALUE h = rb_check_hash_type(v);
2591 if (NIL_P(h)
2592 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2593 : (v = h, 1)) {
2594 if (NIL_P(newhash))
2595 newhash = rb_hash_new();
2596 rb_hash_aset(newhash, sym_fallback, v);
2597 }
2598 }
2599
2600 if (!NIL_P(newhash))
2601 rb_hash_freeze(newhash);
2602 *opts = newhash;
2603
2604 return ecflags;
2605}
2606
2607int
2608rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
2609{
2610 return rb_econv_prepare_options(opthash, opts, 0);
2611}
2612
2613rb_econv_t *
2614rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2615{
2616 rb_econv_t *ec;
2617 VALUE replacement;
2618
2619 if (NIL_P(opthash)) {
2620 replacement = Qnil;
2621 }
2622 else {
2623 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2624 rb_bug("rb_econv_open_opts called with invalid opthash");
2625 replacement = rb_hash_aref(opthash, sym_replace);
2626 }
2627
2628 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2629 if (!ec)
2630 return ec;
2631
2632 if (!NIL_P(replacement)) {
2633 int ret;
2634 rb_encoding *enc = rb_enc_get(replacement);
2635
2636 ret = rb_econv_set_replacement(ec,
2637 (const unsigned char *)RSTRING_PTR(replacement),
2638 RSTRING_LEN(replacement),
2639 rb_enc_name(enc));
2640 if (ret == -1) {
2641 rb_econv_close(ec);
2642 return NULL;
2643 }
2644 }
2645 return ec;
2646}
2647
2648static int
2649enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2650{
2651 rb_encoding *enc;
2652 const char *n;
2653 int encidx;
2654 VALUE encval;
2655
2656 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2657 !(enc = rb_enc_from_index(encidx))) {
2658 enc = NULL;
2659 encidx = 0;
2660 n = StringValueCStr(*arg);
2661 }
2662 else {
2663 n = rb_enc_name(enc);
2664 }
2665
2666 *name_p = n;
2667 *enc_p = enc;
2668
2669 return encidx;
2670}
2671
2672static int
2673str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2674 const char **sname_p, rb_encoding **senc_p,
2675 const char **dname_p, rb_encoding **denc_p)
2676{
2677 rb_encoding *senc, *denc;
2678 const char *sname, *dname;
2679 int sencidx, dencidx;
2680
2681 dencidx = enc_arg(arg1, &dname, &denc);
2682
2683 if (NIL_P(*arg2)) {
2684 sencidx = rb_enc_get_index(str);
2685 senc = rb_enc_from_index(sencidx);
2686 sname = rb_enc_name(senc);
2687 }
2688 else {
2689 sencidx = enc_arg(arg2, &sname, &senc);
2690 }
2691
2692 *sname_p = sname;
2693 *senc_p = senc;
2694 *dname_p = dname;
2695 *denc_p = denc;
2696 return dencidx;
2697}
2698
2699static int
2700str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2701{
2702 VALUE dest;
2703 VALUE str = *self;
2704 VALUE arg1, arg2;
2705 long blen, slen;
2706 unsigned char *buf, *bp, *sp;
2707 const unsigned char *fromp;
2708 rb_encoding *senc, *denc;
2709 const char *sname, *dname;
2710 int dencidx;
2711 int explicitly_invalid_replace = TRUE;
2712
2713 rb_check_arity(argc, 0, 2);
2714
2715 if (argc == 0) {
2716 arg1 = rb_enc_default_internal();
2717 if (NIL_P(arg1)) {
2718 if (!ecflags) return -1;
2719 arg1 = rb_obj_encoding(str);
2720 }
2721 if (!(ecflags & ECONV_INVALID_MASK)) {
2722 explicitly_invalid_replace = FALSE;
2723 }
2725 }
2726 else {
2727 arg1 = argv[0];
2728 }
2729 arg2 = argc<=1 ? Qnil : argv[1];
2730 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2731
2732 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2736 if (senc && senc == denc) {
2737 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2738 VALUE rep = Qnil;
2739 if (!NIL_P(ecopts)) {
2740 rep = rb_hash_aref(ecopts, sym_replace);
2741 }
2742 dest = rb_enc_str_scrub(senc, str, rep);
2743 if (NIL_P(dest)) dest = str;
2744 *self = dest;
2745 return dencidx;
2746 }
2747 return NIL_P(arg2) ? -1 : dencidx;
2748 }
2749 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2751 return dencidx;
2752 }
2753 }
2754 if (encoding_equal(sname, dname)) {
2755 return NIL_P(arg2) ? -1 : dencidx;
2756 }
2757 }
2758 else {
2759 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2760 rb_encoding *utf8 = rb_utf8_encoding();
2761 str = rb_str_conv_enc(str, senc, utf8);
2762 senc = utf8;
2763 sname = "UTF-8";
2764 }
2765 if (encoding_equal(sname, dname)) {
2766 sname = "";
2767 dname = "";
2768 }
2769 }
2770
2771 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2772 slen = RSTRING_LEN(str);
2773 blen = slen + 30; /* len + margin */
2774 dest = rb_str_tmp_new(blen);
2775 bp = (unsigned char *)RSTRING_PTR(dest);
2776
2777 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2778 if (fromp != sp+slen) {
2779 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2780 }
2781 buf = (unsigned char *)RSTRING_PTR(dest);
2782 *bp = '\0';
2783 rb_str_set_len(dest, bp - buf);
2784
2785 /* set encoding */
2786 if (!denc) {
2787 dencidx = rb_define_dummy_encoding(dname);
2788 RB_GC_GUARD(arg1);
2789 RB_GC_GUARD(arg2);
2790 }
2791 *self = dest;
2792
2793 return dencidx;
2794}
2795
2796static int
2797str_transcode(int argc, VALUE *argv, VALUE *self)
2798{
2799 VALUE opt;
2800 int ecflags = 0;
2801 VALUE ecopts = Qnil;
2802
2803 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2804 if (!NIL_P(opt)) {
2805 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2806 }
2807 return str_transcode0(argc, argv, self, ecflags, ecopts);
2808}
2809
2810static inline VALUE
2811str_encode_associate(VALUE str, int encidx)
2812{
2813 int cr = 0;
2814
2815 rb_enc_associate_index(str, encidx);
2816
2817 /* transcoded string never be broken. */
2820 }
2821 else {
2823 }
2824 ENC_CODERANGE_SET(str, cr);
2825 return str;
2826}
2827
2828/*
2829 * call-seq:
2830 * str.encode!(encoding, **options) -> str
2831 * str.encode!(dst_encoding, src_encoding, **options) -> str
2832 *
2833 * The first form transcodes the contents of <i>str</i> from
2834 * str.encoding to +encoding+.
2835 * The second form transcodes the contents of <i>str</i> from
2836 * src_encoding to dst_encoding.
2837 * The +options+ keyword arguments give details for conversion. See String#encode
2838 * for details.
2839 * Returns the string even if no changes were made.
2840 */
2841
2842static VALUE
2843str_encode_bang(int argc, VALUE *argv, VALUE str)
2844{
2845 VALUE newstr;
2846 int encidx;
2847
2848 rb_check_frozen(str);
2849
2850 newstr = str;
2851 encidx = str_transcode(argc, argv, &newstr);
2852
2853 if (encidx < 0) return str;
2854 if (newstr == str) {
2855 rb_enc_associate_index(str, encidx);
2856 return str;
2857 }
2858 rb_str_shared_replace(str, newstr);
2859 return str_encode_associate(str, encidx);
2860}
2861
2862static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2863
2864/*
2865 * call-seq:
2866 * str.encode(encoding, **options) -> str
2867 * str.encode(dst_encoding, src_encoding, **options) -> str
2868 * str.encode(**options) -> str
2869 *
2870 * The first form returns a copy of +str+ transcoded
2871 * to encoding +encoding+.
2872 * The second form returns a copy of +str+ transcoded
2873 * from src_encoding to dst_encoding.
2874 * The last form returns a copy of +str+ transcoded to
2875 * <tt>Encoding.default_internal</tt>.
2876 *
2877 * By default, the first and second form raise
2878 * Encoding::UndefinedConversionError for characters that are
2879 * undefined in the destination encoding, and
2880 * Encoding::InvalidByteSequenceError for invalid byte sequences
2881 * in the source encoding. The last form by default does not raise
2882 * exceptions but uses replacement strings.
2883 *
2884 * The +options+ keyword arguments give details for conversion.
2885 * The arguments are:
2886 *
2887 * :invalid ::
2888 * If the value is +:replace+, #encode replaces invalid byte sequences in
2889 * +str+ with the replacement character. The default is to raise the
2890 * Encoding::InvalidByteSequenceError exception
2891 * :undef ::
2892 * If the value is +:replace+, #encode replaces characters which are
2893 * undefined in the destination encoding with the replacement character.
2894 * The default is to raise the Encoding::UndefinedConversionError.
2895 * :replace ::
2896 * Sets the replacement string to the given value. The default replacement
2897 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2898 * :fallback ::
2899 * Sets the replacement string by the given object for undefined
2900 * character. The object should be a Hash, a Proc, a Method, or an
2901 * object which has [] method.
2902 * Its key is an undefined character encoded in the source encoding
2903 * of current transcoder. Its value can be any encoding until it
2904 * can be converted into the destination encoding of the transcoder.
2905 * :xml ::
2906 * The value must be +:text+ or +:attr+.
2907 * If the value is +:text+ #encode replaces undefined characters with their
2908 * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2909 * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2910 * If the value is +:attr+, #encode also quotes the replacement result
2911 * (using '"'), and replaces '"' with "&quot;".
2912 * :cr_newline ::
2913 * Replaces LF ("\n") with CR ("\r") if value is true.
2914 * :crlf_newline ::
2915 * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2916 * :universal_newline ::
2917 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2918 */
2919
2920static VALUE
2921str_encode(int argc, VALUE *argv, VALUE str)
2922{
2923 VALUE newstr = str;
2924 int encidx = str_transcode(argc, argv, &newstr);
2925 return encoded_dup(newstr, str, encidx);
2926}
2927
2928VALUE
2929rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2930{
2931 int argc = 1;
2932 VALUE *argv = &to;
2933 VALUE newstr = str;
2934 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2935 return encoded_dup(newstr, str, encidx);
2936}
2937
2938static VALUE
2939encoded_dup(VALUE newstr, VALUE str, int encidx)
2940{
2941 if (encidx < 0) return rb_str_dup(str);
2942 if (newstr == str) {
2943 newstr = rb_str_dup(str);
2944 rb_enc_associate_index(newstr, encidx);
2945 return newstr;
2946 }
2947 else {
2948 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2949 }
2950 return str_encode_associate(newstr, encidx);
2951}
2952
2953/*
2954 * Document-class: Encoding::Converter
2955 *
2956 * Encoding conversion class.
2957 */
2958static void
2959econv_free(void *ptr)
2960{
2961 rb_econv_t *ec = ptr;
2962 rb_econv_close(ec);
2963}
2964
2965static size_t
2966econv_memsize(const void *ptr)
2967{
2968 return sizeof(rb_econv_t);
2969}
2970
2971static const rb_data_type_t econv_data_type = {
2972 "econv",
2973 {0, econv_free, econv_memsize,},
2974 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2975};
2976
2977static VALUE
2978econv_s_allocate(VALUE klass)
2979{
2980 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2981}
2982
2983static rb_encoding *
2984make_dummy_encoding(const char *name)
2985{
2986 rb_encoding *enc;
2987 int idx;
2988 idx = rb_define_dummy_encoding(name);
2989 enc = rb_enc_from_index(idx);
2990 return enc;
2991}
2992
2993static rb_encoding *
2994make_encoding(const char *name)
2995{
2996 rb_encoding *enc;
2997 enc = rb_enc_find(name);
2998 if (!enc)
2999 enc = make_dummy_encoding(name);
3000 return enc;
3001}
3002
3003static VALUE
3004make_encobj(const char *name)
3005{
3006 return rb_enc_from_encoding(make_encoding(name));
3007}
3008
3009/*
3010 * call-seq:
3011 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3012 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3013 *
3014 * Returns the corresponding ASCII compatible encoding.
3015 *
3016 * Returns nil if the argument is an ASCII compatible encoding.
3017 *
3018 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3019 * can represent exactly the same characters as the given ASCII incompatible encoding.
3020 * So, no conversion undefined error occurs when converting between the two encodings.
3021 *
3022 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3023 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3024 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3025 *
3026 */
3027static VALUE
3028econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3029{
3030 const char *arg_name, *result_name;
3031 rb_encoding *arg_enc, *result_enc;
3032
3033 enc_arg(&arg, &arg_name, &arg_enc);
3034
3035 result_name = rb_econv_asciicompat_encoding(arg_name);
3036
3037 if (result_name == NULL)
3038 return Qnil;
3039
3040 result_enc = make_encoding(result_name);
3041
3042 return rb_enc_from_encoding(result_enc);
3043}
3044
3045static void
3046econv_args(int argc, VALUE *argv,
3047 VALUE *snamev_p, VALUE *dnamev_p,
3048 const char **sname_p, const char **dname_p,
3049 rb_encoding **senc_p, rb_encoding **denc_p,
3050 int *ecflags_p,
3051 VALUE *ecopts_p)
3052{
3053 VALUE opt, flags_v, ecopts;
3054 int sidx, didx;
3055 const char *sname, *dname;
3056 rb_encoding *senc, *denc;
3057 int ecflags;
3058
3059 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3060
3061 if (!NIL_P(flags_v)) {
3062 if (!NIL_P(opt)) {
3063 rb_error_arity(argc + 1, 2, 3);
3064 }
3065 ecflags = NUM2INT(rb_to_int(flags_v));
3066 ecopts = Qnil;
3067 }
3068 else if (!NIL_P(opt)) {
3069 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3070 }
3071 else {
3072 ecflags = 0;
3073 ecopts = Qnil;
3074 }
3075
3076 senc = NULL;
3077 sidx = rb_to_encoding_index(*snamev_p);
3078 if (0 <= sidx) {
3079 senc = rb_enc_from_index(sidx);
3080 }
3081 else {
3082 StringValue(*snamev_p);
3083 }
3084
3085 denc = NULL;
3086 didx = rb_to_encoding_index(*dnamev_p);
3087 if (0 <= didx) {
3088 denc = rb_enc_from_index(didx);
3089 }
3090 else {
3091 StringValue(*dnamev_p);
3092 }
3093
3094 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3095 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3096
3097 *sname_p = sname;
3098 *dname_p = dname;
3099 *senc_p = senc;
3100 *denc_p = denc;
3101 *ecflags_p = ecflags;
3102 *ecopts_p = ecopts;
3103}
3104
3105static int
3106decorate_convpath(VALUE convpath, int ecflags)
3107{
3108 int num_decorators;
3109 const char *decorators[MAX_ECFLAGS_DECORATORS];
3110 int i;
3111 int n, len;
3112
3113 num_decorators = decorator_names(ecflags, decorators);
3114 if (num_decorators == -1)
3115 return -1;
3116
3117 len = n = RARRAY_LENINT(convpath);
3118 if (n != 0) {
3119 VALUE pair = RARRAY_AREF(convpath, n-1);
3120 if (RB_TYPE_P(pair, T_ARRAY)) {
3121 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3122 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3123 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3124 const rb_transcoder *tr = load_transcoder_entry(entry);
3125 if (!tr)
3126 return -1;
3127 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3128 tr->asciicompat_type == asciicompat_encoder) {
3129 n--;
3130 rb_ary_store(convpath, len + num_decorators - 1, pair);
3131 }
3132 }
3133 else {
3134 rb_ary_store(convpath, len + num_decorators - 1, pair);
3135 }
3136 }
3137
3138 for (i = 0; i < num_decorators; i++)
3139 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3140
3141 return 0;
3142}
3143
3144static void
3145search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3146{
3147 VALUE *ary_p = arg;
3148 VALUE v;
3149
3150 if (NIL_P(*ary_p)) {
3151 *ary_p = rb_ary_new();
3152 }
3153
3154 if (DECORATOR_P(sname, dname)) {
3155 v = rb_str_new_cstr(dname);
3156 }
3157 else {
3158 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3159 }
3160 rb_ary_store(*ary_p, depth, v);
3161}
3162
3163/*
3164 * call-seq:
3165 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3166 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3167 *
3168 * Returns a conversion path.
3169 *
3170 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3171 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3172 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3173 *
3174 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3175 * or
3176 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3177 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3178 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3179 * # "universal_newline"]
3180 *
3181 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3182 * or
3183 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3184 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3185 * # "universal_newline",
3186 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3187 */
3188static VALUE
3189econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3190{
3191 VALUE snamev, dnamev;
3192 const char *sname, *dname;
3193 rb_encoding *senc, *denc;
3194 int ecflags;
3195 VALUE ecopts;
3196 VALUE convpath;
3197
3198 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3199
3200 convpath = Qnil;
3201 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3202
3203 if (NIL_P(convpath)) {
3204 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3205 RB_GC_GUARD(snamev);
3206 RB_GC_GUARD(dnamev);
3207 rb_exc_raise(exc);
3208 }
3209
3210 if (decorate_convpath(convpath, ecflags) == -1) {
3211 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3212 RB_GC_GUARD(snamev);
3213 RB_GC_GUARD(dnamev);
3214 rb_exc_raise(exc);
3215 }
3216
3217 return convpath;
3218}
3219
3220/*
3221 * Check the existence of a conversion path.
3222 * Returns the number of converters in the conversion path.
3223 * result: >=0:success -1:failure
3224 */
3225int
3226rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3227{
3228 VALUE convpath = Qnil;
3229 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3230 &convpath);
3231 return RTEST(convpath);
3232}
3233
3235 rb_econv_t *ec;
3236 int index;
3237 int ret;
3238};
3239
3240static void
3241rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3242{
3244 int ret;
3245
3246 if (a->ret == -1)
3247 return;
3248
3249 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3250
3251 a->ret = ret;
3252 return;
3253}
3254
3255static rb_econv_t *
3256rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3257 const char **sname_p, const char **dname_p,
3258 rb_encoding **senc_p, rb_encoding**denc_p)
3259{
3260 rb_econv_t *ec;
3261 long i;
3262 int ret, first=1;
3263 VALUE elt;
3264 rb_encoding *senc = 0, *denc = 0;
3265 const char *sname, *dname;
3266
3267 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3268 DATA_PTR(self) = ec;
3269
3270 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3271 VALUE snamev, dnamev;
3272 VALUE pair;
3273 elt = rb_ary_entry(convpath, i);
3274 if (!NIL_P(pair = rb_check_array_type(elt))) {
3275 if (RARRAY_LEN(pair) != 2)
3276 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3277 snamev = rb_ary_entry(pair, 0);
3278 enc_arg(&snamev, &sname, &senc);
3279 dnamev = rb_ary_entry(pair, 1);
3280 enc_arg(&dnamev, &dname, &denc);
3281 }
3282 else {
3283 sname = "";
3284 dname = StringValueCStr(elt);
3285 }
3286 if (DECORATOR_P(sname, dname)) {
3287 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3288 if (ret == -1) {
3289 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3290 RB_GC_GUARD(snamev);
3291 RB_GC_GUARD(dnamev);
3292 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3293 }
3294 }
3295 else {
3296 int j = ec->num_trans;
3297 struct rb_econv_init_by_convpath_t arg;
3298 arg.ec = ec;
3299 arg.index = ec->num_trans;
3300 arg.ret = 0;
3301 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3302 if (ret == -1 || arg.ret == -1) {
3303 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3304 RB_GC_GUARD(snamev);
3305 RB_GC_GUARD(dnamev);
3306 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3307 }
3308 if (first) {
3309 first = 0;
3310 *senc_p = senc;
3311 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3312 }
3313 *denc_p = denc;
3314 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3315 }
3316 }
3317
3318 if (first) {
3319 *senc_p = NULL;
3320 *denc_p = NULL;
3321 *sname_p = "";
3322 *dname_p = "";
3323 }
3324
3325 ec->source_encoding_name = *sname_p;
3326 ec->destination_encoding_name = *dname_p;
3327
3328 return ec;
3329}
3330
3331/*
3332 * call-seq:
3333 * Encoding::Converter.new(source_encoding, destination_encoding)
3334 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3335 * Encoding::Converter.new(convpath)
3336 *
3337 * possible options elements:
3338 * hash form:
3339 * :invalid => nil # raise error on invalid byte sequence (default)
3340 * :invalid => :replace # replace invalid byte sequence
3341 * :undef => nil # raise error on undefined conversion (default)
3342 * :undef => :replace # replace undefined conversion
3343 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3344 * :newline => :universal # decorator for converting CRLF and CR to LF
3345 * :newline => :crlf # decorator for converting LF to CRLF
3346 * :newline => :cr # decorator for converting LF to CR
3347 * :universal_newline => true # decorator for converting CRLF and CR to LF
3348 * :crlf_newline => true # decorator for converting LF to CRLF
3349 * :cr_newline => true # decorator for converting LF to CR
3350 * :xml => :text # escape as XML CharData.
3351 * :xml => :attr # escape as XML AttValue
3352 * integer form:
3353 * Encoding::Converter::INVALID_REPLACE
3354 * Encoding::Converter::UNDEF_REPLACE
3355 * Encoding::Converter::UNDEF_HEX_CHARREF
3356 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3357 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3358 * Encoding::Converter::CR_NEWLINE_DECORATOR
3359 * Encoding::Converter::XML_TEXT_DECORATOR
3360 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3361 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3362 *
3363 * Encoding::Converter.new creates an instance of Encoding::Converter.
3364 *
3365 * Source_encoding and destination_encoding should be a string or
3366 * Encoding object.
3367 *
3368 * opt should be nil, a hash or an integer.
3369 *
3370 * convpath should be an array.
3371 * convpath may contain
3372 * - two-element arrays which contain encodings or encoding names, or
3373 * - strings representing decorator names.
3374 *
3375 * Encoding::Converter.new optionally takes an option.
3376 * The option should be a hash or an integer.
3377 * The option hash can contain :invalid => nil, etc.
3378 * The option integer should be logical-or of constants such as
3379 * Encoding::Converter::INVALID_REPLACE, etc.
3380 *
3381 * [:invalid => nil]
3382 * Raise error on invalid byte sequence. This is a default behavior.
3383 * [:invalid => :replace]
3384 * Replace invalid byte sequence by replacement string.
3385 * [:undef => nil]
3386 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3387 * This is a default behavior.
3388 * [:undef => :replace]
3389 * Replace undefined character in destination_encoding with replacement string.
3390 * [:replace => string]
3391 * Specify the replacement string.
3392 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3393 * [:universal_newline => true]
3394 * Convert CRLF and CR to LF.
3395 * [:crlf_newline => true]
3396 * Convert LF to CRLF.
3397 * [:cr_newline => true]
3398 * Convert LF to CR.
3399 * [:xml => :text]
3400 * Escape as XML CharData.
3401 * This form can be used as an HTML 4.0 #PCDATA.
3402 * - '&' -> '&amp;'
3403 * - '<' -> '&lt;'
3404 * - '>' -> '&gt;'
3405 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3406 * [:xml => :attr]
3407 * Escape as XML AttValue.
3408 * The converted result is quoted as "...".
3409 * This form can be used as an HTML 4.0 attribute value.
3410 * - '&' -> '&amp;'
3411 * - '<' -> '&lt;'
3412 * - '>' -> '&gt;'
3413 * - '"' -> '&quot;'
3414 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3415 *
3416 * Examples:
3417 * # UTF-16BE to UTF-8
3418 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3419 *
3420 * # Usually, decorators such as newline conversion are inserted last.
3421 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3422 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3423 * # "universal_newline"]
3424 *
3425 * # But, if the last encoding is ASCII incompatible,
3426 * # decorators are inserted before the last conversion.
3427 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3428 * p ec.convpath #=> ["crlf_newline",
3429 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3430 *
3431 * # Conversion path can be specified directly.
3432 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3433 * p ec.convpath #=> ["universal_newline",
3434 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3435 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3436 */
3437static VALUE
3438econv_init(int argc, VALUE *argv, VALUE self)
3439{
3440 VALUE ecopts;
3441 VALUE snamev, dnamev;
3442 const char *sname, *dname;
3443 rb_encoding *senc, *denc;
3444 rb_econv_t *ec;
3445 int ecflags;
3446 VALUE convpath;
3447
3448 if (rb_check_typeddata(self, &econv_data_type)) {
3449 rb_raise(rb_eTypeError, "already initialized");
3450 }
3451
3452 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3453 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3454 ecflags = 0;
3455 ecopts = Qnil;
3456 }
3457 else {
3458 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3459 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3460 }
3461
3462 if (!ec) {
3463 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3464 RB_GC_GUARD(snamev);
3465 RB_GC_GUARD(dnamev);
3466 rb_exc_raise(exc);
3467 }
3468
3469 if (!DECORATOR_P(sname, dname)) {
3470 if (!senc)
3471 senc = make_dummy_encoding(sname);
3472 if (!denc)
3473 denc = make_dummy_encoding(dname);
3474 RB_GC_GUARD(snamev);
3475 RB_GC_GUARD(dnamev);
3476 }
3477
3478 ec->source_encoding = senc;
3479 ec->destination_encoding = denc;
3480
3481 DATA_PTR(self) = ec;
3482
3483 return self;
3484}
3485
3486/*
3487 * call-seq:
3488 * ec.inspect -> string
3489 *
3490 * Returns a printable version of <i>ec</i>
3491 *
3492 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3493 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3494 *
3495 */
3496static VALUE
3497econv_inspect(VALUE self)
3498{
3499 const char *cname = rb_obj_classname(self);
3500 rb_econv_t *ec;
3501
3502 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3503 if (!ec)
3504 return rb_sprintf("#<%s: uninitialized>", cname);
3505 else {
3506 const char *sname = ec->source_encoding_name;
3507 const char *dname = ec->destination_encoding_name;
3508 VALUE str;
3509 str = rb_sprintf("#<%s: ", cname);
3510 econv_description(sname, dname, ec->flags, str);
3511 rb_str_cat2(str, ">");
3512 return str;
3513 }
3514}
3515
3516static rb_econv_t *
3517check_econv(VALUE self)
3518{
3519 rb_econv_t *ec;
3520
3521 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3522 if (!ec) {
3523 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3524 }
3525 return ec;
3526}
3527
3528/*
3529 * call-seq:
3530 * ec.source_encoding -> encoding
3531 *
3532 * Returns the source encoding as an Encoding object.
3533 */
3534static VALUE
3535econv_source_encoding(VALUE self)
3536{
3537 rb_econv_t *ec = check_econv(self);
3538 if (!ec->source_encoding)
3539 return Qnil;
3540 return rb_enc_from_encoding(ec->source_encoding);
3541}
3542
3543/*
3544 * call-seq:
3545 * ec.destination_encoding -> encoding
3546 *
3547 * Returns the destination encoding as an Encoding object.
3548 */
3549static VALUE
3550econv_destination_encoding(VALUE self)
3551{
3552 rb_econv_t *ec = check_econv(self);
3553 if (!ec->destination_encoding)
3554 return Qnil;
3555 return rb_enc_from_encoding(ec->destination_encoding);
3556}
3557
3558/*
3559 * call-seq:
3560 * ec.convpath -> ary
3561 *
3562 * Returns the conversion path of ec.
3563 *
3564 * The result is an array of conversions.
3565 *
3566 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3567 * p ec.convpath
3568 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3569 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3570 * # "crlf_newline"]
3571 *
3572 * Each element of the array is a pair of encodings or a string.
3573 * A pair means an encoding conversion.
3574 * A string means a decorator.
3575 *
3576 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3577 * a converter from ISO-8859-1 to UTF-8.
3578 * "crlf_newline" means newline converter from LF to CRLF.
3579 */
3580static VALUE
3581econv_convpath(VALUE self)
3582{
3583 rb_econv_t *ec = check_econv(self);
3584 VALUE result;
3585 int i;
3586
3587 result = rb_ary_new();
3588 for (i = 0; i < ec->num_trans; i++) {
3589 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3590 VALUE v;
3591 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3592 v = rb_str_new_cstr(tr->dst_encoding);
3593 else
3594 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3595 rb_ary_push(result, v);
3596 }
3597 return result;
3598}
3599
3600/*
3601 * call-seq:
3602 * ec == other -> true or false
3603 */
3604static VALUE
3605econv_equal(VALUE self, VALUE other)
3606{
3607 rb_econv_t *ec1 = check_econv(self);
3608 rb_econv_t *ec2;
3609 int i;
3610
3611 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3612 return Qnil;
3613 }
3614 ec2 = DATA_PTR(other);
3615 if (!ec2) return Qfalse;
3616 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3617 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3618 return Qfalse;
3619 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3620 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3621 return Qfalse;
3622 if (ec1->flags != ec2->flags) return Qfalse;
3623 if (ec1->replacement_enc != ec2->replacement_enc &&
3624 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3625 return Qfalse;
3626 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3627 if (ec1->replacement_str != ec2->replacement_str &&
3628 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3629 return Qfalse;
3630
3631 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3632 for (i = 0; i < ec1->num_trans; i++) {
3633 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3634 return Qfalse;
3635 }
3636 return Qtrue;
3637}
3638
3639static VALUE
3640econv_result_to_symbol(rb_econv_result_t res)
3641{
3642 switch (res) {
3643 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3644 case econv_incomplete_input: return sym_incomplete_input;
3645 case econv_undefined_conversion: return sym_undefined_conversion;
3646 case econv_destination_buffer_full: return sym_destination_buffer_full;
3647 case econv_source_buffer_empty: return sym_source_buffer_empty;
3648 case econv_finished: return sym_finished;
3649 case econv_after_output: return sym_after_output;
3650 default: return INT2NUM(res); /* should not be reached */
3651 }
3652}
3653
3654/*
3655 * call-seq:
3656 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3657 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3658 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3659 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3660 *
3661 * possible opt elements:
3662 * hash form:
3663 * :partial_input => true # source buffer may be part of larger source
3664 * :after_output => true # stop conversion after output before input
3665 * integer form:
3666 * Encoding::Converter::PARTIAL_INPUT
3667 * Encoding::Converter::AFTER_OUTPUT
3668 *
3669 * possible results:
3670 * :invalid_byte_sequence
3671 * :incomplete_input
3672 * :undefined_conversion
3673 * :after_output
3674 * :destination_buffer_full
3675 * :source_buffer_empty
3676 * :finished
3677 *
3678 * primitive_convert converts source_buffer into destination_buffer.
3679 *
3680 * source_buffer should be a string or nil.
3681 * nil means an empty string.
3682 *
3683 * destination_buffer should be a string.
3684 *
3685 * destination_byteoffset should be an integer or nil.
3686 * nil means the end of destination_buffer.
3687 * If it is omitted, nil is assumed.
3688 *
3689 * destination_bytesize should be an integer or nil.
3690 * nil means unlimited.
3691 * If it is omitted, nil is assumed.
3692 *
3693 * opt should be nil, a hash or an integer.
3694 * nil means no flags.
3695 * If it is omitted, nil is assumed.
3696 *
 * primitive_convert converts the content of source_buffer from the beginning
 * and stores the result into destination_buffer.
 *
 * destination_byteoffset and destination_bytesize specify the region in which
 * the converted result is stored.
3702 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3703 * If destination_byteoffset is nil,
3704 * destination_buffer.bytesize is used for appending the result.
3705 * destination_bytesize specifies maximum number of bytes.
3706 * If destination_bytesize is nil,
3707 * destination size is unlimited.
3708 * After conversion, destination_buffer is resized to
3709 * destination_byteoffset + actually produced number of bytes.
3710 * Also destination_buffer's encoding is set to destination_encoding.
3711 *
3712 * primitive_convert drops the converted part of source_buffer.
3713 * the dropped part is converted in destination_buffer or
3714 * buffered in Encoding::Converter object.
3715 *
 * primitive_convert stops conversion when one of the following conditions is met.
 * - an invalid byte sequence is found in the source buffer (:invalid_byte_sequence)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - unexpected end of the source buffer (:incomplete_input)
 *   this occurs only when :partial_input is not specified.
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - a character is not representable in the output encoding (:undefined_conversion)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - after some output is generated, before input is done (:after_output)
 *   this occurs only when :after_output is specified.
 * - the destination buffer is full (:destination_buffer_full)
 *   this occurs only when destination_bytesize is non-nil.
 * - the source buffer is empty (:source_buffer_empty)
 *   this occurs only when :partial_input is specified.
 * - conversion is finished (:finished)
3731 *
3732 * example:
3733 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3734 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3735 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3736 *
3737 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3738 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3739 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3740 * ret = ec.primitive_convert(src, dst="", nil, 1)
3741 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3742 * ret = ec.primitive_convert(src, dst="", nil, 1)
3743 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3744 * ret = ec.primitive_convert(src, dst="", nil, 1)
3745 * p [ret, src, dst] #=> [:finished, "", "i"]
3746 *
3747 */
3748static VALUE
3749econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3750{
3751 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3752 rb_econv_t *ec = check_econv(self);
3753 rb_econv_result_t res;
3754 const unsigned char *ip, *is;
3755 unsigned char *op, *os;
3756 long output_byteoffset, output_bytesize;
3757 unsigned long output_byteend;
3758 int flags;
3759
3760 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3761
3762 if (NIL_P(output_byteoffset_v))
3763 output_byteoffset = 0; /* dummy */
3764 else
3765 output_byteoffset = NUM2LONG(output_byteoffset_v);
3766
3767 if (NIL_P(output_bytesize_v))
3768 output_bytesize = 0; /* dummy */
3769 else
3770 output_bytesize = NUM2LONG(output_bytesize_v);
3771
3772 if (!NIL_P(flags_v)) {
3773 if (!NIL_P(opt)) {
3774 rb_error_arity(argc + 1, 2, 5);
3775 }
3776 flags = NUM2INT(rb_to_int(flags_v));
3777 }
3778 else if (!NIL_P(opt)) {
3779 VALUE v;
3780 flags = 0;
3781 v = rb_hash_aref(opt, sym_partial_input);
3782 if (RTEST(v))
3783 flags |= ECONV_PARTIAL_INPUT;
3784 v = rb_hash_aref(opt, sym_after_output);
3785 if (RTEST(v))
3786 flags |= ECONV_AFTER_OUTPUT;
3787 }
3788 else {
3789 flags = 0;
3790 }
3791
3792 StringValue(output);
3793 if (!NIL_P(input))
3794 StringValue(input);
3795 rb_str_modify(output);
3796
3797 if (NIL_P(output_bytesize_v)) {
3798#if USE_RVARGC
3799 output_bytesize = rb_str_capacity(output);
3800#else
3801 output_bytesize = RSTRING_EMBED_LEN_MAX;
3802#endif
3803 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3804 output_bytesize = RSTRING_LEN(input);
3805 }
3806
3807 retry:
3808
3809 if (NIL_P(output_byteoffset_v))
3810 output_byteoffset = RSTRING_LEN(output);
3811
3812 if (output_byteoffset < 0)
3813 rb_raise(rb_eArgError, "negative output_byteoffset");
3814
3815 if (RSTRING_LEN(output) < output_byteoffset)
3816 rb_raise(rb_eArgError, "output_byteoffset too big");
3817
3818 if (output_bytesize < 0)
3819 rb_raise(rb_eArgError, "negative output_bytesize");
3820
3821 output_byteend = (unsigned long)output_byteoffset +
3822 (unsigned long)output_bytesize;
3823
3824 if (output_byteend < (unsigned long)output_byteoffset ||
3825 LONG_MAX < output_byteend)
3826 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3827
3828 if (rb_str_capacity(output) < output_byteend)
3829 rb_str_resize(output, output_byteend);
3830
3831 if (NIL_P(input)) {
3832 ip = is = NULL;
3833 }
3834 else {
3835 ip = (const unsigned char *)RSTRING_PTR(input);
3836 is = ip + RSTRING_LEN(input);
3837 }
3838
3839 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3840 os = op + output_bytesize;
3841
3842 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3843 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3844 if (!NIL_P(input)) {
3845 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3846 }
3847
3848 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3849 if (LONG_MAX / 2 < output_bytesize)
3850 rb_raise(rb_eArgError, "too long conversion result");
3851 output_bytesize *= 2;
3852 output_byteoffset_v = Qnil;
3853 goto retry;
3854 }
3855
3856 if (ec->destination_encoding) {
3857 rb_enc_associate(output, ec->destination_encoding);
3858 }
3859
3860 return econv_result_to_symbol(res);
3861}
3862
3863/*
3864 * call-seq:
3865 * ec.convert(source_string) -> destination_string
3866 *
3867 * Convert source_string and return destination_string.
3868 *
3869 * source_string is assumed as a part of source.
3870 * i.e. :partial_input=>true is specified internally.
3871 * finish method should be used last.
3872 *
3873 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3874 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3875 * puts ec.finish.dump #=> ""
3876 *
3877 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3878 * puts ec.convert("\xA4").dump #=> ""
3879 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3880 * puts ec.finish.dump #=> ""
3881 *
3882 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3883 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3884 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3885 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3886 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3887 *
 * If a conversion error occurs,
3889 * Encoding::UndefinedConversionError or
3890 * Encoding::InvalidByteSequenceError is raised.
3891 * Encoding::Converter#convert doesn't supply methods to recover or restart
3892 * from these exceptions.
3893 * When you want to handle these conversion errors,
3894 * use Encoding::Converter#primitive_convert.
3895 *
3896 */
3897static VALUE
3898econv_convert(VALUE self, VALUE source_string)
3899{
3900 VALUE ret, dst;
3901 VALUE av[5];
3902 int ac;
3903 rb_econv_t *ec = check_econv(self);
3904
3905 StringValue(source_string);
3906
3907 dst = rb_str_new(NULL, 0);
3908
3909 av[0] = rb_str_dup(source_string);
3910 av[1] = dst;
3911 av[2] = Qnil;
3912 av[3] = Qnil;
3914 ac = 5;
3915
3916 ret = econv_primitive_convert(ac, av, self);
3917
3918 if (ret == sym_invalid_byte_sequence ||
3919 ret == sym_undefined_conversion ||
3920 ret == sym_incomplete_input) {
3921 VALUE exc = make_econv_exception(ec);
3922 rb_exc_raise(exc);
3923 }
3924
3925 if (ret == sym_finished) {
3926 rb_raise(rb_eArgError, "converter already finished");
3927 }
3928
3929 if (ret != sym_source_buffer_empty) {
3930 rb_bug("unexpected result of econv_primitive_convert");
3931 }
3932
3933 return dst;
3934}
3935
3936/*
3937 * call-seq:
3938 * ec.finish -> string
3939 *
3940 * Finishes the converter.
3941 * It returns the last part of the converted string.
3942 *
3943 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3944 * p ec.convert("\u3042") #=> "\e$B$\""
3945 * p ec.finish #=> "\e(B"
3946 */
3947static VALUE
3948econv_finish(VALUE self)
3949{
3950 VALUE ret, dst;
3951 VALUE av[5];
3952 int ac;
3953 rb_econv_t *ec = check_econv(self);
3954
3955 dst = rb_str_new(NULL, 0);
3956
3957 av[0] = Qnil;
3958 av[1] = dst;
3959 av[2] = Qnil;
3960 av[3] = Qnil;
3961 av[4] = INT2FIX(0);
3962 ac = 5;
3963
3964 ret = econv_primitive_convert(ac, av, self);
3965
3966 if (ret == sym_invalid_byte_sequence ||
3967 ret == sym_undefined_conversion ||
3968 ret == sym_incomplete_input) {
3969 VALUE exc = make_econv_exception(ec);
3970 rb_exc_raise(exc);
3971 }
3972
3973 if (ret != sym_finished) {
3974 rb_bug("unexpected result of econv_primitive_convert");
3975 }
3976
3977 return dst;
3978}
3979
3980/*
3981 * call-seq:
3982 * ec.primitive_errinfo -> array
3983 *
3984 * primitive_errinfo returns important information regarding the last error
3985 * as a 5-element array:
3986 *
3987 * [result, enc1, enc2, error_bytes, readagain_bytes]
3988 *
3989 * result is the last result of primitive_convert.
3990 *
3991 * Other elements are only meaningful when result is
3992 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3993 *
3994 * enc1 and enc2 indicate a conversion step as a pair of strings.
3995 * For example, a converter from EUC-JP to ISO-8859-1 converts
3996 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3997 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3998 *
3999 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4000 * error_bytes is discarded portion.
4001 * readagain_bytes is buffered portion which is read again on next conversion.
4002 *
4003 * Example:
4004 *
4005 * # \xff is invalid as EUC-JP.
4006 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4007 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4008 * p ec.primitive_errinfo
4009 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4010 *
4011 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
 *   # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4013 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4014 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4015 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4016 * p ec.primitive_errinfo
4017 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4018 *
4019 * # partial character is invalid
4020 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4021 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4022 * p ec.primitive_errinfo
4023 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4024 *
4025 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4026 * # partial characters.
4027 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4028 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4029 * p ec.primitive_errinfo
4030 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4031 *
4032 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4033 * # no low surrogate after high surrogate (\xd8\x00).
4034 * # It is detected by 3rd byte (\00) which is part of next character.
4035 * # So the high surrogate (\xd8\x00) is discarded and
4036 * # the 3rd byte is read again later.
4037 * # Since the byte is buffered in ec, it is dropped from src.
4038 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4039 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4040 * p ec.primitive_errinfo
4041 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4042 * p src
4043 * #=> "@"
4044 *
4045 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4046 * # The problem is detected by 4th byte.
4047 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4048 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4049 * p ec.primitive_errinfo
4050 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4051 * p src
4052 * #=> ""
4053 *
4054 */
4055static VALUE
4056econv_primitive_errinfo(VALUE self)
4057{
4058 rb_econv_t *ec = check_econv(self);
4059
4060 VALUE ary;
4061
4062 ary = rb_ary_new2(5);
4063
4064 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4065 rb_ary_store(ary, 4, Qnil);
4066
4067 if (ec->last_error.source_encoding)
4068 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4069
4070 if (ec->last_error.destination_encoding)
4071 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4072
4073 if (ec->last_error.error_bytes_start) {
4074 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4075 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4076 }
4077
4078 return ary;
4079}
4080
4081/*
4082 * call-seq:
4083 * ec.insert_output(string) -> nil
4084 *
4085 * Inserts string into the encoding converter.
4086 * The string will be converted to the destination encoding and
4087 * output on later conversions.
4088 *
4089 * If the destination encoding is stateful,
4090 * string is converted according to the state and the state is updated.
4091 *
4092 * This method should be used only when a conversion error occurs.
4093 *
4094 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4095 * src = "HIRAGANA LETTER A is \u{3042}."
4096 * dst = ""
4097 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4098 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4099 * ec.insert_output("<err>")
4100 * p ec.primitive_convert(src, dst) #=> :finished
4101 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4102 *
4103 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4104 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4105 * dst = ""
4106 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4107 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4108 * ec.insert_output "?" # state change required to output "?".
4109 * p ec.primitive_convert(src, dst) #=> :finished
4110 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4111 *
4112 */
4113static VALUE
4114econv_insert_output(VALUE self, VALUE string)
4115{
4116 const char *insert_enc;
4117
4118 int ret;
4119
4120 rb_econv_t *ec = check_econv(self);
4121
4122 StringValue(string);
4123 insert_enc = rb_econv_encoding_to_insert_output(ec);
4124 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4125
4126 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4127 if (ret == -1) {
4128 rb_raise(rb_eArgError, "too big string");
4129 }
4130
4131 return Qnil;
4132}
4133
4134/*
4135 * call-seq:
4136 * ec.putback -> string
4137 * ec.putback(max_numbytes) -> string
4138 *
4139 * Put back the bytes which will be converted.
4140 *
 * Such bytes result from an invalid_byte_sequence error.
 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4143 * some bytes are buffered to be converted later.
4144 * The latter bytes can be put back.
4145 * It can be observed by
4146 * Encoding::InvalidByteSequenceError#readagain_bytes and
4147 * Encoding::Converter#primitive_errinfo.
4148 *
4149 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4150 * src = "\x00\xd8\x61\x00"
4151 * dst = ""
4152 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4153 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4154 * p ec.putback #=> "a\x00"
4155 * p ec.putback #=> "" # no more bytes to put back
4156 *
4157 */
4158static VALUE
4159econv_putback(int argc, VALUE *argv, VALUE self)
4160{
4161 rb_econv_t *ec = check_econv(self);
4162 int n;
4163 int putbackable;
4164 VALUE str, max;
4165
4166 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4167 n = rb_econv_putbackable(ec);
4168 }
4169 else {
4170 n = NUM2INT(max);
4171 putbackable = rb_econv_putbackable(ec);
4172 if (putbackable < n)
4173 n = putbackable;
4174 }
4175
4176 str = rb_str_new(NULL, n);
4177 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4178
4179 if (ec->source_encoding) {
4180 rb_enc_associate(str, ec->source_encoding);
4181 }
4182
4183 return str;
4184}
4185
4186/*
4187 * call-seq:
4188 * ec.last_error -> exception or nil
4189 *
4190 * Returns an exception object for the last conversion.
4191 * Returns nil if the last conversion did not produce an error.
4192 *
 * Here "error" means
 * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
 * Encoding::Converter#convert, and
 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
 * Encoding::Converter#primitive_convert.
4198 *
4199 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4200 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4201 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4202 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4203 * p ec.last_error #=> nil
4204 *
4205 */
4206static VALUE
4207econv_last_error(VALUE self)
4208{
4209 rb_econv_t *ec = check_econv(self);
4210 VALUE exc;
4211
4212 exc = make_econv_exception(ec);
4213 if (NIL_P(exc))
4214 return Qnil;
4215 return exc;
4216}
4217
4218/*
4219 * call-seq:
4220 * ec.replacement -> string
4221 *
4222 * Returns the replacement string.
4223 *
4224 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4225 * p ec.replacement #=> "?"
4226 *
4227 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4228 * p ec.replacement #=> "\uFFFD"
4229 */
4230static VALUE
4231econv_get_replacement(VALUE self)
4232{
4233 rb_econv_t *ec = check_econv(self);
4234 int ret;
4235 rb_encoding *enc;
4236
4237 ret = make_replacement(ec);
4238 if (ret == -1) {
4239 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4240 }
4241
4242 enc = rb_enc_find(ec->replacement_enc);
4243 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4244}
4245
4246/*
4247 * call-seq:
4248 * ec.replacement = string
4249 *
4250 * Sets the replacement string.
4251 *
4252 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4253 * ec.replacement = "<undef>"
4254 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4255 */
4256static VALUE
4257econv_set_replacement(VALUE self, VALUE arg)
4258{
4259 rb_econv_t *ec = check_econv(self);
4260 VALUE string = arg;
4261 int ret;
4262 rb_encoding *enc;
4263
4264 StringValue(string);
4265 enc = rb_enc_get(string);
4266
4267 ret = rb_econv_set_replacement(ec,
4268 (const unsigned char *)RSTRING_PTR(string),
4269 RSTRING_LEN(string),
4270 rb_enc_name(enc));
4271
4272 if (ret == -1) {
4273 /* xxx: rb_eInvalidByteSequenceError? */
4274 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4275 }
4276
4277 return arg;
4278}
4279
4280VALUE
4282{
4283 return make_econv_exception(ec);
4284}
4285
4286void
4288{
4289 VALUE exc;
4290
4291 exc = make_econv_exception(ec);
4292 if (NIL_P(exc))
4293 return;
4294 rb_exc_raise(exc);
4295}
4296
4297/*
4298 * call-seq:
4299 * ecerr.source_encoding_name -> string
4300 *
4301 * Returns the source encoding name as a string.
4302 */
4303static VALUE
4304ecerr_source_encoding_name(VALUE self)
4305{
4306 return rb_attr_get(self, id_source_encoding_name);
4307}
4308
4309/*
4310 * call-seq:
4311 * ecerr.source_encoding -> encoding
4312 *
4313 * Returns the source encoding as an encoding object.
4314 *
4315 * Note that the result may not be equal to the source encoding of
4316 * the encoding converter if the conversion has multiple steps.
4317 *
4318 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4319 * begin
4320 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4321 * rescue Encoding::UndefinedConversionError
4322 * p $!.source_encoding #=> #<Encoding:UTF-8>
4323 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4324 * p $!.source_encoding_name #=> "UTF-8"
4325 * p $!.destination_encoding_name #=> "EUC-JP"
4326 * end
4327 *
4328 */
4329static VALUE
4330ecerr_source_encoding(VALUE self)
4331{
4332 return rb_attr_get(self, id_source_encoding);
4333}
4334
4335/*
4336 * call-seq:
4337 * ecerr.destination_encoding_name -> string
4338 *
4339 * Returns the destination encoding name as a string.
4340 */
4341static VALUE
4342ecerr_destination_encoding_name(VALUE self)
4343{
4344 return rb_attr_get(self, id_destination_encoding_name);
4345}
4346
4347/*
4348 * call-seq:
 *    ecerr.destination_encoding -> encoding
4350 *
4351 * Returns the destination encoding as an encoding object.
4352 */
4353static VALUE
4354ecerr_destination_encoding(VALUE self)
4355{
4356 return rb_attr_get(self, id_destination_encoding);
4357}
4358
4359/*
4360 * call-seq:
4361 * ecerr.error_char -> string
4362 *
4363 * Returns the one-character string which cause Encoding::UndefinedConversionError.
4364 *
4365 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4366 * begin
4367 * ec.convert("\xa0")
4368 * rescue Encoding::UndefinedConversionError
4369 * puts $!.error_char.dump #=> "\xC2\xA0"
4370 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4371 * end
4372 *
4373 */
4374static VALUE
4375ecerr_error_char(VALUE self)
4376{
4377 return rb_attr_get(self, id_error_char);
4378}
4379
4380/*
4381 * call-seq:
4382 * ecerr.error_bytes -> string
4383 *
4384 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4385 *
4386 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4387 * begin
4388 * ec.convert("abc\xA1\xFFdef")
4389 * rescue Encoding::InvalidByteSequenceError
4390 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4391 * puts $!.error_bytes.dump #=> "\xA1"
4392 * puts $!.readagain_bytes.dump #=> "\xFF"
4393 * end
4394 */
4395static VALUE
4396ecerr_error_bytes(VALUE self)
4397{
4398 return rb_attr_get(self, id_error_bytes);
4399}
4400
4401/*
4402 * call-seq:
4403 * ecerr.readagain_bytes -> string
4404 *
4405 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4406 */
4407static VALUE
4408ecerr_readagain_bytes(VALUE self)
4409{
4410 return rb_attr_get(self, id_readagain_bytes);
4411}
4412
4413/*
4414 * call-seq:
4415 * ecerr.incomplete_input? -> true or false
4416 *
4417 * Returns true if the invalid byte sequence error is caused by
4418 * premature end of string.
4419 *
4420 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4421 *
4422 * begin
4423 * ec.convert("abc\xA1z")
4424 * rescue Encoding::InvalidByteSequenceError
4425 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4426 * p $!.incomplete_input? #=> false
4427 * end
4428 *
4429 * begin
4430 * ec.convert("abc\xA1")
4431 * ec.finish
4432 * rescue Encoding::InvalidByteSequenceError
4433 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4434 * p $!.incomplete_input? #=> true
4435 * end
4436 */
4437static VALUE
4438ecerr_incomplete_input(VALUE self)
4439{
4440 return rb_attr_get(self, id_incomplete_input);
4441}
4442
/*
 * Document-class: Encoding::UndefinedConversionError
 *
 * Raised by Encoding and String methods when a transcoding operation
 * fails because a character that is valid in the source encoding has
 * no related character in the destination encoding.
 */
4449
/*
 * Document-class: Encoding::InvalidByteSequenceError
 *
 * Raised by Encoding and String methods when the string being
 * transcoded contains a byte invalid for either the source or
 * target encoding.
 */
4457
/*
 * Document-class: Encoding::ConverterNotFoundError
 *
 * Raised by transcoding methods when a named encoding does not
 * correspond to a known converter.
 */
4464
4465void
4466Init_transcode(void)
4467{
4468 transcoder_table = st_init_strcasetable();
4469
4470 id_destination_encoding = rb_intern_const("destination_encoding");
4471 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4472 id_error_bytes = rb_intern_const("error_bytes");
4473 id_error_char = rb_intern_const("error_char");
4474 id_incomplete_input = rb_intern_const("incomplete_input");
4475 id_readagain_bytes = rb_intern_const("readagain_bytes");
4476 id_source_encoding = rb_intern_const("source_encoding");
4477 id_source_encoding_name = rb_intern_const("source_encoding_name");
4478
4479 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4480 sym_undef = ID2SYM(rb_intern_const("undef"));
4481 sym_replace = ID2SYM(rb_intern_const("replace"));
4482 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4483 sym_xml = ID2SYM(rb_intern_const("xml"));
4484 sym_text = ID2SYM(rb_intern_const("text"));
4485 sym_attr = ID2SYM(rb_intern_const("attr"));
4486
4487 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4488 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4489 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4490 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4491 sym_finished = ID2SYM(rb_intern_const("finished"));
4492 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4493 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4494 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4495 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4496 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4497 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4498
4499#ifdef ENABLE_ECONV_NEWLINE_OPTION
4500 sym_newline = ID2SYM(rb_intern_const("newline"));
4501 sym_universal = ID2SYM(rb_intern_const("universal"));
4502 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4503 sym_cr = ID2SYM(rb_intern_const("cr"));
4504 sym_lf = ID2SYM(rb_intern_const("lf"));
4505#endif
4506
4507 InitVM(transcode);
4508}
4509
4510void
4511InitVM_transcode(void)
4512{
4513 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4514 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4515 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4516
4517 rb_define_method(rb_cString, "encode", str_encode, -1);
4518 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4519
4520 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4521 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4522 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4523 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4524 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4525 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4526 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4527 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4528 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4529 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4530 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4531 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4532 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4533 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4534 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4535 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4536 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4537 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4538 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4539
4540 /* Document-const: INVALID_MASK
4541 *
4542 * Mask for invalid byte sequences
4543 */
4544 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4545
4546 /* Document-const: INVALID_REPLACE
4547 *
4548 * Replace invalid byte sequences
4549 */
4550 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4551
4552 /* Document-const: UNDEF_MASK
4553 *
4554 * Mask for a valid character in the source encoding but no related
4555 * character(s) in destination encoding.
4556 */
4557 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4558
4559 /* Document-const: UNDEF_REPLACE
4560 *
4561 * Replace byte sequences that are undefined in the destination encoding.
4562 */
4563 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4564
4565 /* Document-const: UNDEF_HEX_CHARREF
4566 *
4567 * Replace byte sequences that are undefined in the destination encoding
4568 * with an XML hexadecimal character reference. This is valid for XML
4569 * conversion.
4570 */
4571 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4572
4573 /* Document-const: PARTIAL_INPUT
4574 *
4575 * Indicates the source may be part of a larger string. See
4576 * primitive_convert for an example.
4577 */
4578 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4579
4580 /* Document-const: AFTER_OUTPUT
4581 *
4582 * Stop converting after some output is complete but before all of the
4583 * input was consumed. See primitive_convert for an example.
4584 */
4585 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4586
4587 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4588 *
4589 * Decorator for converting CRLF and CR to LF
4590 */
4591 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4592
4593 /* Document-const: CRLF_NEWLINE_DECORATOR
4594 *
4595 * Decorator for converting LF to CRLF
4596 */
4597 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4598
4599 /* Document-const: CR_NEWLINE_DECORATOR
4600 *
4601 * Decorator for converting LF to CR
4602 */
4603 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4604
4605 /* Document-const: XML_TEXT_DECORATOR
4606 *
4607 * Escape as XML CharData
4608 */
4609 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4610
4611 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4612 *
4613 * Escape as XML AttValue
4614 */
4615 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4616
4617 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4618 *
4619 * Escape as XML AttValue
4620 */
4621 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4622
4623 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4624 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4625 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4626 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4627 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4628
4629 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4630 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4631 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4632 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4633 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4634 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4635 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4636
4637 Init_newline();
4638}
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:869
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition: class.c:2406
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:1914
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition: transcode.h:535
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition: transcode.h:551
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition: string.h:1738
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition: transcode.h:529
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition: memory.h:397
#define ALLOC
Old name of RB_ALLOC.
Definition: memory.h:394
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:145
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition: transcode.h:533
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition: transcode.h:520
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition: transcode.h:530
#define xrealloc
Old name of ruby_xrealloc.
Definition: xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:143
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition: transcode.h:523
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition: transcode.h:532
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition: array.h:653
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition: transcode.h:531
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition: transcode.h:521
#define T_HASH
Old name of RUBY_T_HASH.
Definition: value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition: error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition: transcode.h:522
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition: transcode.h:550
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition: transcode.h:519
#define INT2NUM
Old name of RB_INT2NUM.
Definition: int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition: value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition: transcode.h:524
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition: transcode.h:526
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3021
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:671
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:802
void rb_warning(const char *fmt,...)
Issues a warning.
Definition: error.c:449
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
VALUE rb_cString
String class.
Definition: string.c:80
Encoding relates APIs.
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.
Definition: encoding.c:918
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:1038
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1182
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:776
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:940
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1206
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:668
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition: transcode.c:2563
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition: transcode.c:2065
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition: transcode.c:1506
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition: transcode.c:2608
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1449
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition: transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition: transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition: transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition: transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition: transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition: transcode.c:1745
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition: transcode.c:3226
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition: transcode.c:1072
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition: transcode.c:1894
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition: transcode.c:1885
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition: transcode.c:1789
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition: transcode.c:1590
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition: transcode.c:1906
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2614
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition: transcode.c:1954
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition: transcode.c:1971
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition: transcode.c:1937
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2929
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition: transcode.c:4281
struct rb_econv_t rb_econv_t
An opaque struct that represents a lowest level of encoding conversion.
Definition: transcode.h:73
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition: transcode.c:4287
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition: transcode.c:1900
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1705
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition: transcode.c:1822
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition: transcode.c:1756
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition: transcode.c:2227
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition: vm_eval.c:1153
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:989
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:750
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1308
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
Definition: array.c:1679
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
Definition: array.c:976
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
Definition: array.c:1148
#define rb_check_frozen
Just another name of rb_check_frozen.
Definition: error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:294
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1896
VALUE rb_hash_freeze(VALUE obj)
Just another name of rb_obj_freeze.
Definition: hash.c:87
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2082
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2903
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1529
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition: proc.c:1003
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition: proc.c:1600
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition: proc.c:2423
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition: proc.c:175
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1540
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1593
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:828
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1356
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1808
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2459
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3039
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:918
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:952
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3056
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2467
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:6567
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1506
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition: string.c:5071
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get()
Definition: variable.c:1293
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1575
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2765
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:924
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition: variable.c:3253
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition: sprintf.c:1201
VALUE rb_str_catf(VALUE dst, const char *fmt,...)
Identical to rb_sprintf(), except it renders the output to the specified object rather than creating ...
Definition: sprintf.c:1241
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition: memory.h:378
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
#define RARRAY_LEN
Just another name of rb_array_len.
Definition: rarray.h:68
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition: rarray.h:324
#define RARRAY_AREF(a, i)
Definition: rarray.h:588
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition: rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:527
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:483
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:497
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition: rstring.h:95
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition: rtypeddata.h:507
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition: variable.c:309
#define InitVM(ext)
This macro is for internal use.
Definition: ruby.h:229
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition: string.c:7520
Definition: transcode.c:174
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:375