From 2482e34ac12910f7e7436a968caf97ab96f8b582 Mon Sep 17 00:00:00 2001 From: Kerin Millar Date: Sat, 26 Apr 2025 08:43:58 +0100 Subject: [PATCH 1/3] Backport fix for invalid continuation bytes being ignored as delimiters This is a partial backport of commit 772e7e760e8a098e4d8dee21cf11090be4757918 from the devel branch. It addresses an issue in read_mbchar() whereby the read builtin can read past the delimiter character, provided that is invoked with a multibyte character set in effect. Consider the following test case. $ LC_ALL=en_US.UTF-8 $ for i in {194..245}; do printf -v o %o "$i"; printf "\\$o\\n"; done | while read -r; do declare -p REPLY; done declare -- REPLY=$'\302\n\303\n\304\n\305\n\306\n\307\n\310\n\311\n\312\ n\313\n\314\n\315\n\316\n\317\n\320\n\321\n\322\n\323\n\324\n\325\n\326\ n\327\n\330\n\331\n\332\n\333\n\334\n\335\n\336\n\337\n\340\n\341\n\342\ n\343\n\344\n\345\n\346\n\347\n\350\n\351\n\352\n\353\n\354\n\355\n\356\ n\357\n\360\n\361\n\362\n\363\n\364\n\365' The producing loop emits a sequence of bytes in the range 0xC2 - 0xF5. Since each is terminated by a character, one would expect for exactly 52 iterations of the consuming loop, with REPLY being assigned a single byte each time. Instead, the input is read in its entirety. Why is that, one may ask. Given a legal UTF-8 byte sequence, any bytes whose values are between 0xC2 - 0xF4 are combinative in nature; they can only be followed by between one and three bytes that are outside of that range. 0xC2 - 0xDF : First byte of a 2-byte code unit sequence 0xE0 - 0xEF : First byte of a 3-byte code unit sequence 0xF0 - 0xF4 : First byte of a 4-byte code unit sequence As such, bash begins by reading the 0xC2 byte, for which mbrtowc(3) returns -2, indicating an incomplete multibyte sequence. Next, the 0x0A byte is read, for which mbrtowc(3) returns -1, indicating an invalid multibyte sequence. At this point, bash ought to recognise the most recently read byte as a delimiter. Instead, it continues reading the input stream up until the delimiter that follows 0xF5, which is neither a combining character nor legal in UTF-8 in any capacity. This patch addresses the issue by introducing the zungetc() function, which is used by read_mbchar() to push back the delimiter character that transforms the sequence from an incomplete one to an invalid one. Said character is then detected by the next invocation of the zread() function, allowing for the decision to be made to return. With this, the output of the test case amounts to 52 lines, as expected. declare -- REPLY=$'\302' declare -- REPLY=$'\303' ... declare -- REPLY=$'\364' declare -- REPLY=$'\365' The issue affects all bash releases from 5.0 to 5.3-alpha. As of the time of writing, it has not been addressed by any of the official patchlevels, nor has 5.3 been released. Link: https://pubs.opengroup.org/onlinepubs/9799919799.2024edition/utilities/read.html#tag_20_100_06 Link: https://mywiki.wooledge.org/BashPitfalls#IFS.3D_read_-r_-d_.27.27_filename Link: https://lists.gnu.org/archive/html/bug-bash/2025-04/msg00068.html Signed-off-by: Kerin Millar --- builtins/read.def | 25 ++++++++++++---- externs.h | 1 + lib/sh/zread.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 6 deletions(-) diff --git builtins/read.def builtins/read.def index ddd91d32..53b4bd81 100644 --- builtins/read.def +++ builtins/read.def @@ -130,7 +130,7 @@ static void set_readline_timeout PARAMS((sh_timer *t, time_t, long)); #endif static SHELL_VAR *bind_read_variable PARAMS((char *, char *, int)); #if defined (HANDLE_MULTIBYTE) -static int read_mbchar PARAMS((int, char *, int, int, int)); +static int read_mbchar PARAMS((int, char *, int, int, int, int)); #endif static void ttyrestore PARAMS((struct ttsave *)); @@ -806,7 +806,7 @@ add_char: else # endif if (locale_utf8locale == 0 || ((c & 0x80) != 0)) - i += read_mbchar (fd, input_string, i, c, unbuffered_read); + i += read_mbchar (fd, input_string, i, c, delim, unbuffered_read); } #endif @@ -1064,10 +1064,10 @@ bind_read_variable (name, value, flags) #if defined (HANDLE_MULTIBYTE) static int -read_mbchar (fd, string, ind, ch, unbuffered) +read_mbchar (fd, string, ind, ch, delim, unbuffered) int fd; char *string; - int ind, ch, unbuffered; + int ind, ch, delim, unbuffered; { char mbchar[MB_LEN_MAX + 1]; int i, n, r; @@ -1101,8 +1101,21 @@ read_mbchar (fd, string, ind, ch, unbuffered) mbchar[i++] = c; continue; } - else if (ret == (size_t)-1 || ret == (size_t)0 || ret > (size_t)0) - break; + else if (ret == (size_t)-1) + { + /* If we read a delimiter character that makes this an invalid + multibyte character, we can't just add it to the input string + and treat it as a byte. We need to push it back so a subsequent + zread will pick it up. */ + if (c == delim) + { + zungetc (c); + mbchar[--i] = '\0'; /* unget the delimiter */ + } + break; /* invalid multibyte character */ + } + else if (ret == (size_t)0 || ret > (size_t)0) + break; /* valid multibyte character */ } mbchar_return: diff --git externs.h externs.h index 931dba9c..1b70a13b 100644 --- externs.h +++ externs.h @@ -536,6 +536,7 @@ extern ssize_t zreadintr PARAMS((int, char *, size_t)); extern ssize_t zreadc PARAMS((int, char *)); extern ssize_t zreadcintr PARAMS((int, char *)); extern ssize_t zreadn PARAMS((int, char *, size_t)); +extern int zungetc PARAMS((int)); extern void zreset PARAMS((void)); extern void zsyncfd PARAMS((int)); diff --git lib/sh/zread.c lib/sh/zread.c index dafb7f60..7cfbb288 100644 --- lib/sh/zread.c +++ lib/sh/zread.c @@ -41,6 +41,10 @@ extern int errno; # define ZBUFSIZ 4096 #endif +#ifndef EOF +# define EOF -1 +#endif + extern int executing_builtin; extern void check_signals_and_traps (void); @@ -48,6 +52,11 @@ extern void check_signals (void); extern int signal_is_trapped (int); extern int read_builtin_timeout (int); +int zungetc (int); + +/* Provide one character of pushback whether we are using read or zread. */ +static int zpushedchar = -1; + /* Read LEN bytes from FD into BUF. Retry the read on EINTR. Any other error causes the loop to break. */ ssize_t @@ -59,6 +68,15 @@ zread (fd, buf, len) ssize_t r; check_signals (); /* check for signals before a blocking read */ + + /* If we pushed a char back, return it immediately */ + if (zpushedchar != -1) + { + *buf = (unsigned char)zpushedchar; + zpushedchar = -1; + return 1; + } + /* should generalize into a mechanism where different parts of the shell can `register' timeouts and have them checked here. */ while (((r = read_builtin_timeout (fd)) < 0 || (r = read (fd, buf, len)) < 0) && @@ -95,6 +113,14 @@ zreadretry (fd, buf, len) ssize_t r; int nintr; + /* If we pushed a char back, return it immediately */ + if (zpushedchar != -1) + { + *buf = (unsigned char)zpushedchar; + zpushedchar = -1; + return 1; + } + for (nintr = 0; ; ) { r = read (fd, buf, len); @@ -118,6 +144,15 @@ zreadintr (fd, buf, len) size_t len; { check_signals (); + + /* If we pushed a char back, return it immediately */ + if (zpushedchar != -1) + { + *buf = (unsigned char)zpushedchar; + zpushedchar = -1; + return 1; + } + return (read (fd, buf, len)); } @@ -135,6 +170,14 @@ zreadc (fd, cp) { ssize_t nr; + /* If we pushed a char back, return it immediately */ + if (zpushedchar != -1 && cp) + { + *cp = (unsigned char)zpushedchar; + zpushedchar = -1; + return 1; + } + if (lind == lused || lused == 0) { nr = zread (fd, lbuf, sizeof (lbuf)); @@ -160,6 +203,14 @@ zreadcintr (fd, cp) { ssize_t nr; + /* If we pushed a char back, return it immediately */ + if (zpushedchar != -1 && cp) + { + *cp = (unsigned char)zpushedchar; + zpushedchar = -1; + return 1; + } + if (lind == lused || lused == 0) { nr = zreadintr (fd, lbuf, sizeof (lbuf)); @@ -186,6 +237,13 @@ zreadn (fd, cp, len) { ssize_t nr; + if (zpushedchar != -1 && cp) + { + *cp = zpushedchar; + zpushedchar = -1; + return 1; + } + if (lind == lused || lused == 0) { if (len > sizeof (lbuf)) @@ -204,6 +262,22 @@ zreadn (fd, cp, len) return 1; } +int +zungetc (c) + int c; +{ + if (zpushedchar == -1) + { + zpushedchar = c; + return c; + } + + if (c == EOF || lind == 0) + return (EOF); + lbuf[--lind] = c; /* XXX */ + return c; +} + void zreset () { -- 2.49.0