diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a121a18f9a09bc0670a6e71ad7eb2b442eb85bf8..e7c459dceb3a9d5aa05501278d85f9cc7ac0eeab 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3223,6 +3223,7 @@ (define_expand "vec_unpacks_hi_<mode>"
     DONE;
   }
 )
+
 (define_insn "extend<mode><Vwide>2"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
 	(float_extend:<VWIDE>
@@ -3232,6 +3233,32 @@ (define_insn "extend<mode><Vwide>2"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
+/* A BF->SF conversion is a left shift by 16 bits, but shifts are expensive
+   and the generic middle-end expansion would force a move through a DImode
+   GPR.  Use EXT instead for better throughput, staying in SIMD registers.  */
+
+(define_expand "extendbfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+	(float_extend:SF
+	  (match_operand:BF 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+{
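+  /* aarch64_gen_shareable_zero emits a zero vector that later passes can
+     CSE with other vector zeros in the function.  */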
+  rtx tmp0 = aarch64_gen_shareable_zero (V8BFmode);
+  rtx op0 = force_lowpart_subreg (V8BFmode, operands[1], BFmode);
+  rtx res = gen_reg_rtx (V8BFmode);
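+  /* The EXT immediate is in V8BF lanes: 7 lanes = 14 bytes, which places
+     the BF value in the top 16 bits of the first SF lane.  */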
+  emit_insn (gen_aarch64_extv8bf (res, tmp0, op0, gen_int_mode (7, SImode)));
+  /* Subregs between floating point modes aren't allowed to change size, so go
+     through V4SFmode.  */
+  res = force_lowpart_subreg (V4SFmode, res, V8BFmode);
+  res = force_lowpart_subreg (SFmode, res, V4SFmode);
+  emit_move_insn (operands[0], res);
+  DONE;
+})
+
 ;; Float narrowing operations.
 
 (define_insn "aarch64_float_trunc_rodd_df"
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121853_1.c b/gcc/testsuite/gcc.target/aarch64/pr121853_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..24b2fdc8cea1d0610812665e8faafa34f57b90b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121853_1.c
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O2 -std=c99" } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+__attribute__ ((noipa))
+float convert(__bf16 value) {
+    return (float)value;
+}
+
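+/* Bit-cast helpers; memcpy reinterprets bits without strict-aliasing UB.  */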
+static inline uint32_t f32_bits(float f) {
+    uint32_t u; memcpy(&u, &f, sizeof u); return u;
+}
+static inline __bf16 bf16_from_bits(uint16_t u) {
+    __bf16 b;  memcpy(&b, &u, sizeof b); return b;
+}
+
+/* Fixed bf16 inputs (as raw 16-bit payloads) covering edge cases.  */
+static const uint16_t inputs[] = {
+    0x0000, // +0
+    0x8000, // -0
+    0x7F80, // +inf
+    0xFF80, // -inf
+    0x7FC0, // qNaN (+)  (quiet bit set in bf16)
+    0xFFC0, // qNaN (-)
+    0x7F81, // sNaN (+)  (the bit-shift conversion preserves the payload)
+    0xFF81, // sNaN (-)
+    0x0001, // smallest +subnormal
+    0x007F, // largest  +subnormal
+    0x8001, // smallest -subnormal
+    0x807F, // largest  -subnormal
+    0x0080, // smallest +normal
+    0x3F80, // +1.0
+    0xBF80, // -1.0
+    0x3F00, // +0.5
+    0xBF00, // -0.5
+    0x3FC0, // +1.5
+    0x7F7F, // max finite +
+    0xFF7F, // max finite -
+};
+
+int main(void) {
+    const size_t N = sizeof(inputs)/sizeof(inputs[0]);
+    size_t fails = 0;
+
+    for (size_t i = 0; i < N; ++i) {
+        __bf16 in = bf16_from_bits(inputs[i]);
+        float out = convert(in);
+        uint32_t got = f32_bits(out);
+        uint32_t exp = (uint32_t) inputs[i] << 16;
+
+        if (got != exp) {
+            printf("FAIL[%zu]: in_bf16=0x%04X  exp_f32=0x%08X  got_f32=0x%08X\n",
+                   i, inputs[i], exp, got);
+            ++fails;
+        }
+    }
+
+    if (fails != 0)
+      __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121853_2.c b/gcc/testsuite/gcc.target/aarch64/pr121853_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e9fb4016bcb7cb92b4d132da5f6fc07eeebd248a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121853_2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+float convert(__bf16 value) {
+    return (float)value;
+}
+
+/*
+** convert:
+**	movi	v[0-9]+.4s, 0
+**	ext	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, #14
+**	ret
+*/
