Ruby 3.1.3p185 (2022-11-24 revision 1a6b16756e0ba6b95ab71a441357ed5484e33498)
encoding.h
Go to the documentation of this file.
1#ifndef RUBY_INTERNAL_ENCODING_ENCODING_H /*-*-C++-*-vi:se ft=cpp:*/
2#define RUBY_INTERNAL_ENCODING_ENCODING_H
24#include "ruby/oniguruma.h"
31#include "ruby/internal/value.h"
34
36
37
43
/**
 * Constants describing how an encoding index is stored inline in the flag
 * bits of an object (see RB_ENCODING_SET_INLINED() / RB_ENCODING_GET_INLINED()).
 */
50enum ruby_encoding_consts {
51
/**
 * Largest value the inline field can hold (127 == 0x7f, i.e. seven bits).
 * RB_ENCODING_GET() treats this exact value as a sentinel and falls back to
 * rb_enc_get_index() when it reads it.
 */
53 RUBY_ENCODING_INLINE_MAX = 127,
54
/** Bit offset of the inline field: ten bits above RUBY_FL_USHIFT. */
56 RUBY_ENCODING_SHIFT = (RUBY_FL_USHIFT+10),
57
/** Bit mask selecting the inline field inside the flags word. */
59 RUBY_ENCODING_MASK = (RUBY_ENCODING_INLINE_MAX<<RUBY_ENCODING_SHIFT
60 /* RUBY_FL_USER10..RUBY_FL_USER16 */),
61
/**
 * Exposed as ENCODING_MAXNAMELEN.
 * NOTE(review): presumably the maximum length of an encoding name; whether
 * it counts the terminating NUL is not visible from here -- confirm.
 */
63 RUBY_ENCODING_MAXNAMELEN = 42
64};
65
66#define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX
67#define ENCODING_SHIFT RUBY_ENCODING_SHIFT
68#define ENCODING_MASK RUBY_ENCODING_MASK
/**
 * Destructively assigns the passed encoding to the passed object.
 *
 * The index is widened to VALUE, shifted up to RUBY_ENCODING_SHIFT and written
 * into the object's flags after the previous field contents are cleared.
 *
 * @param[out]  obj       Object whose flag bits are rewritten.
 * @param[in]   encindex  Encoding index to store.
 * @note        No range check is done here.  NOTE(review): presumably the
 *              caller must guarantee 0 <= encindex <= RUBY_ENCODING_INLINE_MAX,
 *              otherwise bits outside RUBY_ENCODING_MASK get set -- confirm.
 */
79static inline void
80RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
81{
82 VALUE f = /* upcast */ encindex;
83
/* Clear the old field first so stale bits cannot be OR-ed with the new value. */
84 f <<= RUBY_ENCODING_SHIFT;
85 RB_FL_UNSET_RAW(obj, RUBY_ENCODING_MASK);
86 RB_FL_SET_RAW(obj, f);
87}
88
97static inline int
99{
100 VALUE ret = RB_FL_TEST_RAW(obj, RUBY_ENCODING_MASK) >> RUBY_ENCODING_SHIFT;
101
102 return RBIMPL_CAST((int)ret);
103}
104
105#define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i)
106#define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i)
107#define ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj)
108#define ENCODING_GET(obj) RB_ENCODING_GET(obj)
109#define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj)
110#define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN
117
139int rb_char_to_option_kcode(int c, int *option, int *kcode);
140
156int rb_enc_replicate(const char *name, rb_encoding *src);
157
169int rb_define_dummy_encoding(const char *name);
170
180
192
200int rb_enc_get_index(VALUE obj);
201
210static inline int
212{
213 int encindex = RB_ENCODING_GET_INLINED(obj);
214
215 if (encindex == RUBY_ENCODING_INLINE_MAX) {
216 return rb_enc_get_index(obj);
217 }
218 else {
219 return encindex;
220 }
221}
222
233void rb_enc_set_index(VALUE obj, int encindex);
234
/**
 * Just another name of rb_enc_set_index(): forwards both arguments unchanged.
 *
 * @param[out]  obj       Object to assign the encoding to.
 * @param[in]   encindex  Encoding index, passed through to rb_enc_set_index().
 */
236static inline void
237RB_ENCODING_SET(VALUE obj, int encindex)
238{
239 rb_enc_set_index(obj, encindex);
240}
241
/**
 * This is RB_ENCODING_SET() + RB_ENC_CODERANGE_SET() combo.
 *
 * @param[out]  obj       Object to modify.
 * @param[in]   encindex  Encoding index handed to RB_ENCODING_SET().
 * @param[in]   cr        Code range handed to RB_ENC_CODERANGE_SET().
 */
253static inline void
254RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr)
255{
/* The encoding is assigned before the code range. */
256 RB_ENCODING_SET(obj, encindex);
257 RB_ENC_CODERANGE_SET(obj, cr);
258}
259
268int rb_enc_capable(VALUE obj);
269
278int rb_enc_find_index(const char *name);
279
293int rb_enc_alias(const char *alias, const char *orig);
294
303int rb_to_encoding_index(VALUE obj);
304
314rb_encoding *rb_to_encoding(VALUE obj);
315
324rb_encoding *rb_find_encoding(VALUE obj);
325
333rb_encoding *rb_enc_get(VALUE obj);
334
347rb_encoding *rb_enc_compatible(VALUE str1, VALUE str2);
348
359rb_encoding *rb_enc_check(VALUE str1,VALUE str2);
360
375VALUE rb_enc_associate_index(VALUE obj, int encindex);
376
388VALUE rb_enc_associate(VALUE obj, rb_encoding *enc);
389
403void rb_enc_copy(VALUE dst, VALUE src);
404
405
415
424rb_encoding *rb_enc_find(const char *name);
425
432static inline const char *
434{
435 return enc->name;
436}
437
447static inline int
449{
450 return enc->min_enc_len;
451}
452
462static inline int
464{
465 return enc->max_enc_len;
466}
467
484int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc);
485
502int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc);
503
530int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
531
532#define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret)
533#define MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret)
534#define MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret)
535#define MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret)
536#define MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret)
552int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc);
553
566unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc);
567
/**
 * Queries the code point of character pointed by the passed pointer.
 *
 * Thin wrapper around rb_enc_codepoint_len() that passes a null `len`
 * argument, i.e. the caller is not told how many bytes were consumed.
 *
 * @param[in]  p    NOTE(review): presumably the start of the character -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the buffer -- confirm.
 * @param[in]  enc  Encoding, forwarded to rb_enc_codepoint_len().
 * @return     Whatever rb_enc_codepoint_len() returns.
 */
586static inline unsigned int
587rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
588{
589 return rb_enc_codepoint_len(p, e, 0, enc);
590 /* ^^^
591 * This can be `NULL` in C, `nullptr` in C++, and `0` for both.
592 * We choose the most portable one here.
593 */
594}
595
596
/**
 * Identical to rb_enc_codepoint(), except it assumes the passed character is
 * not broken.
 *
 * Both pointers are reinterpreted as `const OnigUChar *` and handed to
 * ONIGENC_MBC_TO_CODE().
 *
 * @param[in]  p    NOTE(review): presumably the start of the character -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the buffer -- confirm.
 * @param[in]  enc  Encoding used for decoding.
 * @return     The result of ONIGENC_MBC_TO_CODE().
 */
606static inline OnigCodePoint
607rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
608{
609 const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
610 const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
611
612 return ONIGENC_MBC_TO_CODE(enc, up, ue);
613}
614
624int rb_enc_codelen(int code, rb_encoding *enc);
625
634static inline int
636{
637 OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
638
639 return ONIGENC_CODE_TO_MBCLEN(enc, uc);
640}
641
/**
 * Identical to rb_enc_uint_chr(), except it writes back to the passed buffer
 * instead of allocating one.
 *
 * @param[in]   c    Code point to encode.
 * @param[out]  buf  Destination buffer, reinterpreted as `OnigUChar *`.
 *                   NOTE(review): no size is passed, so the caller presumably
 *                   must provide enough room for one character -- confirm.
 * @param[in]   enc  Target encoding.
 * @return      The result of ONIGENC_CODE_TO_MBC().  NOTE(review): presumably
 *              the number of bytes written -- confirm.
 */
656static inline int
657rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
658{
659 OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
660 OnigUChar *ubuf = RBIMPL_CAST((OnigUChar *)buf);
661
662 return ONIGENC_CODE_TO_MBC(enc, uc, ubuf);
663}
664
/**
 * Queries the previous (left) character.
 *
 * Casts the three pointers to `const OnigUChar *`, delegates to
 * onigenc_get_prev_char_head() and casts the result back to `char *`.
 *
 * @param[in]  s    NOTE(review): presumably the start of the string -- confirm.
 * @param[in]  p    NOTE(review): presumably the current position -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the string -- confirm.
 * @param[in]  enc  Encoding of the string.
 * @return     The pointer returned by onigenc_get_prev_char_head().
 */
675static inline char *
676rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
677{
678 const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
679 const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
680 const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
681 OnigUChar *ur = onigenc_get_prev_char_head(enc, us, up, ue);
682
683 return RBIMPL_CAST((char *)ur);
684}
685
/**
 * Queries the left boundary of a character.
 *
 * Casts the three pointers to `const OnigUChar *`, delegates to
 * onigenc_get_left_adjust_char_head() and casts the result back to `char *`.
 *
 * @param[in]  s    NOTE(review): presumably the start of the string -- confirm.
 * @param[in]  p    NOTE(review): presumably the position to adjust -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the string -- confirm.
 * @param[in]  enc  Encoding of the string.
 * @return     The pointer returned by onigenc_get_left_adjust_char_head().
 */
696static inline char *
697rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
698{
699 const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
700 const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
701 const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
702 OnigUChar *ur = onigenc_get_left_adjust_char_head(enc, us, up, ue);
703
704 return RBIMPL_CAST((char *)ur);
705}
706
/**
 * Queries the right boundary of a character.
 *
 * Casts the three pointers to `const OnigUChar *`, delegates to
 * onigenc_get_right_adjust_char_head() and casts the result back to `char *`.
 *
 * @param[in]  s    NOTE(review): presumably the start of the string -- confirm.
 * @param[in]  p    NOTE(review): presumably the position to adjust -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the string -- confirm.
 * @param[in]  enc  Encoding of the string.
 * @return     The pointer returned by onigenc_get_right_adjust_char_head().
 */
717static inline char *
718rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
719{
720 const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
721 const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
722 const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
723 OnigUChar *ur = onigenc_get_right_adjust_char_head(enc, us, up, ue);
724
725 return RBIMPL_CAST((char *)ur);
726}
727
/**
 * Scans the string backwards for n characters.
 *
 * Casts the three pointers to `const OnigUChar *` and delegates to
 * onigenc_step_back().
 *
 * @param[in]  s    NOTE(review): presumably the start of the string -- confirm.
 * @param[in]  p    NOTE(review): presumably the current position -- confirm.
 * @param[in]  e    NOTE(review): presumably the end of the string -- confirm.
 * @param[in]  n    Number of characters, forwarded to onigenc_step_back().
 * @param[in]  enc  Encoding of the string.
 * @return     The pointer returned by onigenc_step_back().
 */
739static inline char *
740rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
741{
742 const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
743 const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
744 const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
745 const OnigUChar *ur = onigenc_step_back(enc, us, up, ue, n);
746
/* `ur` is const-qualified here; the cast below drops that qualifier. */
747 return RBIMPL_CAST((char *)ur);
748}
749
/**
 * Tests the same condition as rb_enc_asciicompat() but yields an `int`:
 * the encoding's minimum character length is 1 and it is not a dummy encoding.
 *
 * @param[in]  enc  Encoding in question.
 * @retval     1    rb_enc_mbminlen(enc) is 1 and rb_enc_dummy_p(enc) is zero.
 * @retval     0    Otherwise.
 */
760static inline int
761rb_enc_asciicompat_inline(rb_encoding *enc)
762{
763 return rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc);
764}
765
781static inline bool
783{
784 if (rb_enc_mbminlen(enc) != 1) {
785 return false;
786 }
787 else if (rb_enc_dummy_p(enc)) {
788 return false;
789 }
790 else {
791 return true;
792 }
793}
794
802static inline bool
804{
805 rb_encoding *enc = rb_enc_get(str);
806
807 return rb_enc_asciicompat(enc);
808}
809
819
836
848
860
872
886
897
906
915
916#ifndef rb_ascii8bit_encindex
928int rb_ascii8bit_encindex(void);
929#endif
930
940static inline bool
942{
944}
945
946#ifndef rb_utf8_encindex
954int rb_utf8_encindex(void);
955#endif
956
957#ifndef rb_usascii_encindex
965int rb_usascii_encindex(void);
966#endif
967
974int rb_locale_encindex(void);
975
982int rb_filesystem_encindex(void);
983
991VALUE rb_enc_default_external(void);
992
1000VALUE rb_enc_default_internal(void);
1001
1011void rb_enc_set_default_external(VALUE encoding);
1012
1022void rb_enc_set_default_internal(VALUE encoding);
1023
1033VALUE rb_locale_charmap(VALUE klass);
1034
1036
1037
1038#define RB_ENCODING_GET RB_ENCODING_GET
1039#define RB_ENCODING_GET_INLINED RB_ENCODING_GET_INLINED
1040#define RB_ENCODING_IS_ASCII8BIT RB_ENCODING_IS_ASCII8BIT
1041#define RB_ENCODING_SET RB_ENCODING_SET
1042#define RB_ENCODING_SET_INLINED RB_ENCODING_SET_INLINED
1043#define rb_enc_asciicompat rb_enc_asciicompat
1044#define rb_enc_code_to_mbclen rb_enc_code_to_mbclen
1045#define rb_enc_codepoint rb_enc_codepoint
1046#define rb_enc_left_char_head rb_enc_left_char_head
1047#define rb_enc_mbc_to_codepoint rb_enc_mbc_to_codepoint
1048#define rb_enc_mbcput rb_enc_mbcput
1049#define rb_enc_mbmaxlen rb_enc_mbmaxlen
1050#define rb_enc_mbminlen rb_enc_mbminlen
1051#define rb_enc_name rb_enc_name
1052#define rb_enc_prev_char rb_enc_prev_char
1053#define rb_enc_right_char_head rb_enc_right_char_head
1054#define rb_enc_step_back rb_enc_step_back
1055#define rb_enc_str_asciicompat_p rb_enc_str_asciicompat_p
1058#endif /* RUBY_INTERNAL_ENCODING_ENCODING_H */
static void RB_ENC_CODERANGE_SET(VALUE obj, enum ruby_coderange_type cr)
Destructively modifies the passed object so that its (inline) code range is the passed one.
Definition: coderange.h:129
Defines RBIMPL_ATTR_CONST.
Defines RBIMPL_ATTR_DEPRECATED.
Tweaking visibility of C variables/functions.
#define RUBY_EXTERN
Declaration of externally visible global variables.
Definition: dllexport.h:47
#define RBIMPL_SYMBOL_EXPORT_END()
Counterpart of RBIMPL_SYMBOL_EXPORT_BEGIN.
Definition: dllexport.h:106
#define RBIMPL_SYMBOL_EXPORT_BEGIN()
Shortcut macro equivalent to RUBY_SYMBOL_EXPORT_BEGIN extern "C" {.
Definition: dllexport.h:97
Defines enum ruby_fl_type.
@ RUBY_FL_USHIFT
Number of bits in ruby_fl_type that are not open to users.
Definition: fl_type.h:167
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:507
static void RB_FL_SET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_SET().
Definition: fl_type.h:644
static void RB_FL_UNSET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_UNSET().
Definition: fl_type.h:704
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
int rb_filesystem_encindex(void)
Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding...
Definition: encoding.c:1579
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
static void RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
Destructively assigns the passed encoding to the passed object.
Definition: encoding.h:80
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT a.k.a.
Definition: encoding.c:1515
static bool RB_ENCODING_IS_ASCII8BIT(VALUE obj)
Queries if the passed object is in ascii 8bit (== binary) encoding.
Definition: encoding.h:941
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes requested to represent the passed code point using the passed encoding.
Definition: encoding.c:1284
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
const OnigEncodingType rb_encoding
The type of encoding.
Definition: encoding.h:116
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1592
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1724
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1192
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:697
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
Definition: encoding.c:1533
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1216
static int RB_ENCODING_GET(VALUE obj)
Just another name of rb_enc_get_index.
Definition: encoding.h:211
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
Definition: encoding.c:1521
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1270
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:689
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:676
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:197
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:1030
VALUE rb_locale_charmap(VALUE klass)
Returns a platform-dependent "charmap" of the current locale.
Definition: localeinit.c:91
void rb_enc_set_default_internal(VALUE encoding)
Destructively assigns the passed encoding as the default internal encoding.
Definition: encoding.c:1774
VALUE rb_enc_default_external(void)
Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1651
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:718
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C string instead of a Ruby string.
Definition: encoding.c:918
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
rb_encoding * rb_find_encoding(VALUE obj)
Identical to rb_to_encoding_index(), except the return type.
Definition: encoding.c:336
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:657
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1637
int rb_locale_encindex(void)
Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding its...
Definition: encoding.c:1553
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Converts a character option to its encoding.
Definition: re.c:329
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:463
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1097
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1222
static void RB_ENCODING_SET(VALUE obj, int encindex)
Just another name of rb_enc_set_index.
Definition: encoding.h:237
int rb_enc_capable(VALUE obj)
Queries if the passed object can have its encoding.
Definition: encoding.c:943
static void RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr)
This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.
Definition: encoding.h:254
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:1038
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1176
int rb_enc_replicate(const char *name, rb_encoding *src)
Creates a new encoding, using the passed one as a template.
Definition: encoding.c:550
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:98
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1573
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1539
void rb_enc_set_default_external(VALUE encoding)
Destructively assigns the passed encoding as the default external encoding.
Definition: encoding.c:1691
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:448
int rb_enc_alias(const char *alias, const char *orig)
Registers an "alias" name.
Definition: encoding.c:721
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:635
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:740
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1246
static bool rb_enc_str_asciicompat_p(VALUE str)
Queries if the passed string is in an ASCII-compatible encoding.
Definition: encoding.h:803
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition: encoding.c:1545
RBIMPL_ATTR_CONST() int rb_io_oflags_fmode(int oflags)
Converts an oflags (that rb_io_modestr_oflags() returns) to a fmode (that rb_io_mode_flags() returns)...
RBIMPL_ATTR_PURE() int rb_io_read_pending(rb_io_t *fptr)
Queries if the passed IO has any pending reads.
Defines RBIMPL_ATTR_NOALIAS.
#define RBIMPL_ATTR_NOALIAS()
Wraps (or simulates) __declspec(noalias)
Definition: noalias.h:62
Defines RBIMPL_ATTR_PURE.
Defines struct RBasic.
Defines RBIMPL_ATTR_RETURNS_NONNULL.
#define RBIMPL_ATTR_RETURNS_NONNULL()
Wraps (or simulates) __attribute__((returns_nonnull))
Defines VALUE and ID.