5 files changed, 201 insertions, 1 deletions
diff --git a/array.c b/array.c
index 42aba64abe..02032668f3 100644
--- a/array.c
+++ b/array.c
@@ -23,6 +23,7 @@
 #include "internal/object.h"
 #include "internal/proc.h"
 #include "internal/rational.h"
+#include "internal/string.h"
 #include "internal/vm.h"
 #include "probes.h"
 #include "ruby/encoding.h"
@@ -2993,6 +2994,67 @@ ary_join_1(VALUE obj, VALUE ary, VALUE sep, long i, VALUE result, int *first)
     }
 }
 
+/* Fast path for Array#join: when every element is a String in one fast-path encoding
+ * (UTF-8 / US-ASCII / ASCII-8BIT) and the separator is byte-compatible, the result can
+ * be produced with a single memcpy pass instead of appending each element through
+ * rb_str_buf_append. Returns the joined String, or Qundef when any of those invariants
+ * does not hold -- the caller then uses the general path. No user code runs here, so
+ * the array cannot be mutated underneath us. */
+static VALUE
+ary_join_fast(VALUE ary, VALUE sep)
+{
+    long n = RARRAY_LEN(ary);
+    if (n == 0) return Qundef;
+
+    VALUE first = RARRAY_AREF(ary, 0);
+    if (!RB_TYPE_P(first, T_STRING)) return Qundef;
+    int encidx = ENCODING_GET(first);
+    if (!rb_str_encindex_fastpath(encidx)) return Qundef;
+
+    /* cr accumulates the result code range exactly as rb_str_buf_append would. */
+    enum ruby_coderange_type cr = ENC_CODERANGE_7BIT;
+    long sep_len = 0;
+    const char *sep_ptr = NULL;
+    if (!NIL_P(sep)) {
+        int sep_cr = rb_enc_str_coderange(sep);
+        /* The separator must share the element encoding, or be 7-bit (encidx is
+           ASCII-compatible, so a 7-bit separator concatenates without negotiation). */
+        if (ENCODING_GET(sep) != encidx && sep_cr != ENC_CODERANGE_7BIT) return Qundef;
+        sep_ptr = RSTRING_PTR(sep);
+        sep_len = RSTRING_LEN(sep);
+        if (n > 1) cr = ENC_CODERANGE_AND(cr, sep_cr);
+    }
+
+    /* One pass: confirm the shared encoding, measure the length, merge code ranges. */
+    long len = 1 + sep_len * (n - 1);
+    for (long i = 0; i < n; i++) {
+        VALUE s = RARRAY_AREF(ary, i);
+        if (!RB_TYPE_P(s, T_STRING) || ENCODING_GET(s) != encidx) return Qundef;
+        len += RSTRING_LEN(s);
+        cr = ENC_CODERANGE_AND(cr, rb_enc_str_coderange(s));
+    }
+
+    VALUE result = rb_str_buf_new(len);
+    rb_enc_associate_index(result, encidx);
+    char *const buf = RSTRING_PTR(result);
+    char *p = buf;
+    for (long i = 0; i < n; i++) {
+        VALUE s = RARRAY_AREF(ary, i);
+        long slen = RSTRING_LEN(s);
+        if (i > 0 && sep_len) {
+            memcpy(p, sep_ptr, sep_len);
+            p += sep_len;
+        }
+        memcpy(p, RSTRING_PTR(s), slen);
+        p += slen;
+    }
+
+    ENC_CODERANGE_CLEAR(result);  /* keep rb_str_set_len from rescanning the bytes */
+    rb_str_set_len(result, p - buf);
+    ENC_CODERANGE_SET(result, cr);
+    return result;
+}
+
 VALUE
 rb_ary_join(VALUE ary, VALUE sep)
 {
@@ -3001,8 +3063,12 @@ rb_ary_join(VALUE ary, VALUE sep)
 
     if (RARRAY_LEN(ary) == 0) return rb_usascii_str_new(0, 0);
 
+    if (!NIL_P(sep)) StringValue(sep);
+
+    result = ary_join_fast(ary, sep);
+    if (!UNDEF_P(result)) return result;
+
     if (!NIL_P(sep)) {
-        StringValue(sep);
         len += RSTRING_LEN(sep) * (RARRAY_LEN(ary) - 1);
     }
     long len_memo = RARRAY_LEN(ary);
diff --git a/benchmark/array_join.yml b/benchmark/array_join.yml
new file mode 100644
index 0000000000..65d718e9af
--- /dev/null
+++ b/benchmark/array_join.yml
@@ -0,0 +1,34 @@
+prelude: |
+  # All elements are 7-bit ASCII Strings (the dominant real-world join case).
+  # Distinct objects so large-N cases exercise realistic heap/cache behavior.
+  a100_1    = Array.new(100)    { "a" * 1  }
+  a100_8    = Array.new(100)    { "a" * 8  }
+  a100_40   = Array.new(100)    { "a" * 40 }
+  a1k_1     = Array.new(1000)   { "a" * 1  }
+  a1k_8     = Array.new(1000)   { "a" * 8  }
+  a1k_40    = Array.new(1000)   { "a" * 40 }
+  a100k_1   = Array.new(100000) { "a" * 1  }
+  a100k_8   = Array.new(100000) { "a" * 8  }
+  a100k_40  = Array.new(100000) { "a" * 40 }
+
+benchmark:
+  # separator " " (space)
+  join_sp_n100_e1:    a100_1.join(" ")
+  join_sp_n100_e8:    a100_8.join(" ")
+  join_sp_n100_e40:   a100_40.join(" ")
+  join_sp_n1k_e1:     a1k_1.join(" ")
+  join_sp_n1k_e8:     a1k_8.join(" ")
+  join_sp_n1k_e40:    a1k_40.join(" ")
+  join_sp_n100k_e1:   a100k_1.join(" ")
+  join_sp_n100k_e8:   a100k_8.join(" ")
+  join_sp_n100k_e40:  a100k_40.join(" ")
+  # separator "\n" (newline)
+  join_nl_n100_e1:    a100_1.join("\n")
+  join_nl_n100_e8:    a100_8.join("\n")
+  join_nl_n100_e40:   a100_40.join("\n")
+  join_nl_n1k_e1:     a1k_1.join("\n")
+  join_nl_n1k_e8:     a1k_8.join("\n")
+  join_nl_n1k_e40:    a1k_40.join("\n")
+  join_nl_n100k_e1:   a100k_1.join("\n")
+  join_nl_n100k_e8:   a100k_8.join("\n")
+  join_nl_n100k_e40:  a100k_40.join("\n")
diff --git a/benchmark/array_join_regression.yml b/benchmark/array_join_regression.yml
new file mode 100644
index 0000000000..5cbfa6867f
--- /dev/null
+++ b/benchmark/array_join_regression.yml
@@ -0,0 +1,44 @@
+prelude: |
+  # --- must NOT take any fast path: prove added precompute-loop checks don't regress ---
+  # all multibyte UTF-8 (VALID coderange). 7-bit gate bails at elem 0; same-encoding gate TAKES it.
+  mb_utf8       = Array.new(1000) { "café" }
+  # ASCII except a trailing multibyte elem: WORST CASE for 7-bit gate (scans all, then falls back).
+  tail_mb       = Array.new(1000) { "a" * 8 }; tail_mb[-1] = "café"
+  # non-String elements -> ary_join_1 (to_s). Gate bails at type check before coderange.
+  nonstr_int    = Array.new(1000) { |i| i }
+  # ASCII strings except a trailing Integer: precompute scans strings, then mixed fallback.
+  mixed_tail    = (Array.new(999) { "a" * 8 } << 42)
+  # UTF-16LE: not ASCII-compatible. Both gates bail immediately.
+  utf16         = Array.new(1000) { "ab".encode("UTF-16LE") }
+  # nested arrays -> recursive fallback.
+  nested        = Array.new(500)  { [1, 2] }
+
+  # 7-bit ASCII elements but a multibyte separator (3-byte em dash). Both gates bail at sep check.
+  ascii_elems   = Array.new(1000) { "a" * 8 }
+  sep_mb        = "—"
+
+  # --- fast-path ELIGIBLE variants: confirm benefit generalizes beyond single chars / find crossover ---
+  # frozen ASCII elements (read-only path).
+  frozen_elems  = Array.new(1000) { ("a" * 8).freeze }
+  # large elements: where memcpy should dominate and the win decays toward 1x.
+  big_e256      = Array.new(1000) { "a" * 256  }
+  big_e1k       = Array.new(1000) { "a" * 1024 }
+  big_e4k       = Array.new(1000) { "a" * 4096 }
+
+benchmark:
+  # regression (fallback / gate worst-case) — all use a 1-byte ASCII separator unless noted
+  reg_multibyte_utf8:   mb_utf8.join(" ")
+  reg_tail_multibyte:   tail_mb.join(" ")
+  reg_nonstring_int:    nonstr_int.join(" ")
+  reg_mixed_tail_int:   mixed_tail.join(" ")
+  reg_utf16:            utf16.join(" ".encode("UTF-16LE"))
+  reg_nested:           nested.join(" ")
+  reg_multibyte_sep:    ascii_elems.join(sep_mb)
+
+  # fast-path eligible variants
+  fp_frozen:            frozen_elems.join(" ")
+  fp_nosep:             ascii_elems.join
+  fp_multichar_sep:     ascii_elems.join(", ")
+  fp_big_e256:          big_e256.join(" ")
+  fp_big_e1k:           big_e1k.join(" ")
+  fp_big_e4k:           big_e4k.join(" ")
diff --git a/depend b/depend
index a2e8312298..7ae13bd63d 100644
--- a/depend
+++ b/depend
@@ -80,6 +80,7 @@ array.$(OBJEXT): $(top_srcdir)/internal/sanitizers.h
 array.$(OBJEXT): $(top_srcdir)/internal/serial.h
 array.$(OBJEXT): $(top_srcdir)/internal/set_table.h
 array.$(OBJEXT): $(top_srcdir)/internal/static_assert.h
+array.$(OBJEXT): $(top_srcdir)/internal/string.h
 array.$(OBJEXT): $(top_srcdir)/internal/struct.h
 array.$(OBJEXT): $(top_srcdir)/internal/variable.h
 array.$(OBJEXT): $(top_srcdir)/internal/vm.h
@@ -102,6 +103,7 @@ array.$(OBJEXT): {$(VPATH)}config.h
 array.$(OBJEXT): {$(VPATH)}constant.h
 array.$(OBJEXT): {$(VPATH)}debug_counter.h
 array.$(OBJEXT): {$(VPATH)}defines.h
+array.$(OBJEXT): {$(VPATH)}encindex.h
 array.$(OBJEXT): {$(VPATH)}encoding.h
 array.$(OBJEXT): {$(VPATH)}id.h
 array.$(OBJEXT): {$(VPATH)}id_table.h
diff --git a/test/ruby/test_array.rb b/test/ruby/test_array.rb
index 76455187a5..36f8801614 100644
--- a/test/ruby/test_array.rb
+++ b/test/ruby/test_array.rb
@@ -2812,6 +2812,60 @@ class TestArray < Test::Unit::TestCase
     assert_equal("b", x.ary.join(""))
   end
 
+  def test_join_encoding
+    # Multibyte UTF-8 elements: result is UTF-8, valid, not ASCII-only.
+    r = @cls["caf\u00e9", "na\u00efve"].join(" ")
+    assert_equal("caf\u00e9 na\u00efve", r)
+    assert_equal(Encoding::UTF_8, r.encoding)
+    assert_equal(true, r.valid_encoding?)
+    assert_equal(false, r.ascii_only?)
+
+    # All 7-bit content stays ASCII-only.
+    r = @cls["abc", "def"].join(",")
+    assert_equal("abc,def", r)
+    assert_equal(true, r.ascii_only?)
+
+    # Multibyte separator (same encoding as the elements).
+    r = @cls["a", "b", "c"].join("\u2014")
+    assert_equal("a\u2014b\u2014c", r)
+    assert_equal(Encoding::UTF_8, r.encoding)
+    assert_equal(false, r.ascii_only?)
+    assert_equal(true, r.valid_encoding?)
+
+    # Mixed ASCII-compatible encodings, all 7-bit: result takes element 0's encoding.
+    r = @cls["abc".dup.force_encoding("US-ASCII"), "def".dup.force_encoding("UTF-8")].join(" ")
+    assert_equal("abc def", r)
+    assert_equal(Encoding::US_ASCII, r.encoding)
+    assert_equal(true, r.ascii_only?)
+
+    # 7-bit content in a non-ASCII encoding (Shift_JIS).
+    r = @cls["abc".encode("Shift_JIS"), "def".encode("Shift_JIS")].join(" ")
+    assert_equal(Encoding::Shift_JIS, r.encoding)
+    assert_equal("abc def".b, r.b)
+
+    # Same-encoding multibyte (Shift_JIS): byte-for-byte concatenation.
+    s = "\u65e5\u672c".encode("Shift_JIS")
+    sep = "/".encode("Shift_JIS")
+    r = @cls[s, s].join(sep)
+    assert_equal(Encoding::Shift_JIS, r.encoding)
+    assert_equal((s + sep + s).b, r.b)
+
+    # Broken bytes: result keeps them and reports invalid.
+    bad = "\xff\xfe".dup.force_encoding("UTF-8")
+    r = @cls[bad, bad].join(" ")
+    assert_equal((bad + " " + bad).b, r.b)
+    assert_equal(false, r.valid_encoding?)
+
+    # Incompatible element/separator encodings still raise.
+    assert_raise(Encoding::CompatibilityError) do
+      @cls["a".dup.force_encoding("UTF-8"), "b".encode("UTF-16LE")].join(" ")
+    end
+
+    # Bulk copy: large array, verified against a non-join oracle.
+    big = @cls.new(500) { "ab" }
+    assert_equal(("ab," * 500).chomp(","), big.join(","))
+  end
+
   def test_to_a2
     klass = Class.new(Array)
     a = klass.new.to_a