Thanks for the comment. I have updated the patch to v3. Please have a look.
On 2023/8/7 19:01, John Naylor wrote:
> On Fri, Jun 16, 2023 at 8:28 AM YANG Xudong <yangxud...@ymatrix.cn> wrote:
> > > +# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
> > > +# and CFLAGS_CRC.
> > >
> > > +# Check if __builtin_loongarch_crcc_* intrinsics can be used
> > > +# with the default compiler flags.
> > > +# CFLAGS_CRC is set if the extra flag is required.
> > >
> > > Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
> > > confirm?
> > >
> >
> > We don't need to set CFLAGS_CRC as commented. I have updated the
> > configure script to match the logic in the meson build script.
> (Looking again at v2)
>
> The compilation test is found in c-compiler.m4, which still has all the
> logic for CFLAGS_CRC, including saving and restoring the old CFLAGS. Can
> this also be simplified?
Fixed the function in c-compiler.m4 by removing the function argument
and the logic that handles CFLAGS and CFLAGS_CRC.
> I diff'd pg_crc32c_loongarch.c with the current other files, and found
> it is structurally the same as the Arm implementation. That's logical if
> memory alignment is important.
> /*
> - * ARMv8 doesn't require alignment, but aligned memory access is
> - * significantly faster. Process leading bytes so that the loop below
> - * starts with a pointer aligned to eight bytes.
> + * Aligned memory access is significantly faster.
> + * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
> Can you confirm the alignment requirement -- it's not clear what the
> intention is since "doesn't require" wasn't carried over. Is there any
> documentation (or even a report in some other context) about aligned vs
> unaligned memory access performance?
The official documentation states that alignment is not required:

https://github.com/loongson/la-softdev-convention/blob/master/la-softdev-convention.adoc#74-unaligned-memory-access-support

However, I found a patch on LKML that shows a large performance gain from
using aligned memory access, similar to what this patch does:

https://lore.kernel.org/lkml/20230410115734.93365-1-wang...@loongson.cn/

So I believe using aligned memory access is worthwhile, and I have updated
the comment in the code accordingly.
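
In case it helps review, here is a rough micro-benchmark sketch
(hypothetical, not part of the patch; the buffer size, iteration count,
and the standalone loop are only for illustration) that times the same
8-byte CRCC loop over an aligned buffer and over the same buffer shifted
by one byte:

/*
 * Hypothetical micro-benchmark, not part of the patch.  Assumes a
 * LoongArch compiler that provides the __builtin_loongarch_crcc_*
 * intrinsics; the misaligned 8-byte loads rely on the hardware's
 * unaligned-access support described in la-softdev-convention.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE (1024 * 1024)
#define ITERS    1000

static uint32_t
crc32c_loop(uint32_t crc, const unsigned char *p, size_t len)
{
    const unsigned char *pend = p + len;

    /* eight bytes at a time, as in the patch's main loop */
    while (p + 8 <= pend)
    {
        crc = __builtin_loongarch_crcc_w_d_w(*(const uint64_t *) p, crc);
        p += 8;
    }
    /* remaining tail, one byte at a time */
    while (p < pend)
        crc = __builtin_loongarch_crcc_w_b_w(*p++, crc);
    return crc;
}

static double
bench(const unsigned char *p, size_t len)
{
    uint32_t crc = 0;
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < ITERS; i++)
        crc = crc32c_loop(crc, p, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    /* print the CRC so the loop cannot be optimized away */
    printf("crc=%08x ", crc);
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int
main(void)
{
    /* over-allocate so the start can be shifted off 8-byte alignment */
    unsigned char *buf = malloc(BUF_SIZE + 8);

    if (buf == NULL)
        return 1;
    memset(buf, 0xa5, BUF_SIZE + 8);

    printf("aligned:    %.3f s\n", bench(buf, BUF_SIZE));
    printf("misaligned: %.3f s\n", bench(buf + 1, BUF_SIZE));

    free(buf);
    return 0;
}

If alignment matters here as it did in the LKML thread, the aligned run
should be noticeably faster when built with plain "cc -O2" on a
LoongArch machine.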
> --
> John Naylor
> EDB: http://www.enterprisedb.com
From ced3f65d7445bdcca0628c4f5073b5657a81cd28 Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxud...@ymatrix.cn>
Date: Tue, 8 Aug 2023 10:41:58 +0800
Subject: [PATCH v3] Add loongarch native checksum implementation.
---
 config/c-compiler.m4           | 29 ++++++++++++++
 configure                      | 69 ++++++++++++++++++++++++++++----
 configure.ac                   | 33 +++++++++++----
 meson.build                    | 24 +++++++++++
 src/include/pg_config.h.in     |  3 ++
 src/include/port/pg_crc32c.h   |  9 +++++
 src/port/meson.build           |  3 ++
 src/port/pg_crc32c_loongarch.c | 73 ++++++++++++++++++++++++++++++++++
 8 files changed, 228 insertions(+), 15 deletions(-)
create mode 100644 src/port/pg_crc32c_loongarch.c
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..7777ad6e90 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,32 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
+ [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+ [unsigned int crc = 0;
+ crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 963fbbcf1e..c5eb2814f1 100755
--- a/configure
+++ b/configure
@@ -18047,6 +18047,47 @@ fi
fi
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+unsigned int crc = 0;
+ crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_loongarch_crc32c_intrinsics=yes
+else
+ pgac_cv_loongarch_crc32c_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
+ pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
# Select CRC-32C implementation.
@@ -18065,7 +18106,7 @@ fi
#
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
# Use Intel SSE 4.2 if available.
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED"
= x"1" ; then
USE_SSE42_CRC32C=1
@@ -18083,10 +18124,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
else
- # fall back to slicing-by-8 algorithm, which doesn't require any
- # special CPU support.
- USE_SLICING_BY_8_CRC32C=1
- fi
+ if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+ # LoongArch CRCC instructions.
+ USE_LOONGARCH_CRC32C=1
+ else
+ # fall back to slicing-by-8 algorithm, which doesn't require any
+ # special CPU support.
+ USE_SLICING_BY_8_CRC32C=1
+ fi
+ fi
fi
fi
fi
@@ -18127,12 +18173,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
$as_echo "ARMv8 CRC instructions with runtime check" >&6; }
else
+ if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+ PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+ else
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sb8.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
$as_echo "slicing-by-8" >&6; }
+ fi
fi
fi
fi
diff --git a/configure.ac b/configure.ac
index 5153b8b3fd..3011b8ea34 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
fi
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS()
+
AC_SUBST(CFLAGS_CRC)
# Select CRC-32C implementation.
@@ -2117,7 +2123,7 @@ AC_SUBST(CFLAGS_CRC)
#
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
# Use Intel SSE 4.2 if available.
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED"
= x"1" ; then
USE_SSE42_CRC32C=1
@@ -2135,10 +2141,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
else
- # fall back to slicing-by-8 algorithm, which doesn't require any
- # special CPU support.
- USE_SLICING_BY_8_CRC32C=1
- fi
+ if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+ # LoongArch CRCC instructions.
+ USE_LOONGARCH_CRC32C=1
+ else
+ # fall back to slicing-by-8 algorithm, which doesn't require any
+ # special CPU support.
+ USE_SLICING_BY_8_CRC32C=1
+ fi
+ fi
fi
fi
fi
@@ -2166,9 +2177,15 @@ else
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o
pg_crc32c_armv8_choose.o"
AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
else
- AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
- PG_CRC32C_OBJS="pg_crc32c_sb8.o"
- AC_MSG_RESULT(slicing-by-8)
+ if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+ AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+ PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+ AC_MSG_RESULT(LoongArch CRCC instructions)
+ else
+ AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+ AC_MSG_RESULT(slicing-by-8)
+ fi
fi
fi
fi
diff --git a/meson.build b/meson.build
index 0a11efc97a..2acb204003 100644
--- a/meson.build
+++ b/meson.build
@@ -2065,6 +2065,30 @@ int main(void)
cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
endif
+
+elif host_cpu == 'loongarch64'
+
+ prog = '''
+int main(void)
+{
+ unsigned int crc = 0;
+ crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+ crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+ /* return computed value, to prevent the above being optimized away */
+ return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+ args: test_c_args)
+ # Use LoongArch CRC instruction unconditionally
+ cdata.set('USE_LOONGARCH_CRC32C', 1)
+ have_optimized_crc = true
+ endif
+
endif
if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index ee209d6d70..d8a2985567 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -714,6 +714,9 @@
/* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
#undef USE_LLVM
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
/* Define to 1 to build with LZ4 support. (--with-lz4) */
#undef USE_LZ4
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*
diff --git a/src/port/meson.build b/src/port/meson.build
index 9d0cd93c43..deb354418d 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
+ # loongarch
+ ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
# generic fallback
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..2897920800
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ * Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+ const unsigned char *p = data;
+ const unsigned char *pend = p + len;
+
+	/*
+	 * LoongArch desktop and server chips support unaligned memory access by default.
+	 * However, aligned memory access is significantly faster.
+	 * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
+	 */
+ if (!PointerIsAligned(p, uint16) &&
+ p + 1 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+ p += 1;
+ }
+ if (!PointerIsAligned(p, uint32) &&
+ p + 2 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+ p += 2;
+ }
+ if (!PointerIsAligned(p, uint64) &&
+ p + 4 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+ p += 4;
+ }
+
+ /* Process eight bytes at a time, as far as we can. */
+ while (p + 8 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+ p += 8;
+ }
+
+ /* Process remaining 0-7 bytes. */
+ if (p + 4 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+ p += 4;
+ }
+ if (p + 2 <= pend)
+ {
+ crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+ p += 2;
+ }
+ if (p < pend)
+ {
+ crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+ }
+
+ return crc;
+}
--
2.41.0