From f65d25d95d31a0b1c362c9e473bb26454280f43c Mon Sep 17 00:00:00 2001 From: Aayush Khanna Date: Tue, 23 Jun 2026 01:06:20 +0530 Subject: [PATCH 1/5] feat: add highway as a dep and do a poc for SIMD kernel --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes. report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown_pkg_readmes status: na - task: lint_markdown_docs status: passed - task: lint_markdown status: passed - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: na - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: passed - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- deps/checksums/highway_1_2_0_tar_gz/sha256 | 1 + deps/test/highway/test_install.cpp | 58 ++++++ docs/contributing/development.md | 3 + .../base/daxpy/benchmark/cpp/highway/Makefile | 159 ++++++++++++++++ .../daxpy/benchmark/cpp/highway/benchmark.cpp | 161 +++++++++++++++++ .../@stdlib/blas/base/daxpy/binding.gyp | 17 +- .../@stdlib/blas/base/daxpy/include.gypi | 6 +- .../@stdlib/blas/base/daxpy/manifest.json | 48 +++++ .../base/daxpy/src/{addon.c => addon.cpp} | 11 +- .../@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp | 90 ++++++++++ tools/make/common.mk | 17 ++ tools/make/lib/benchmark/cpp.mk | 6 +- tools/make/lib/install/Makefile | 7 +- tools/make/lib/install/README.md | 33 ++++ tools/make/lib/install/addons.mk | 3 + tools/make/lib/install/highway.mk | 170 ++++++++++++++++++ 16 files changed, 
782 insertions(+), 8 deletions(-) create mode 100644 deps/checksums/highway_1_2_0_tar_gz/sha256 create mode 100644 deps/test/highway/test_install.cpp create mode 100644 lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile create mode 100644 lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp rename lib/node_modules/@stdlib/blas/base/daxpy/src/{addon.c => addon.cpp} (84%) create mode 100644 lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp create mode 100644 tools/make/lib/install/highway.mk diff --git a/deps/checksums/highway_1_2_0_tar_gz/sha256 b/deps/checksums/highway_1_2_0_tar_gz/sha256 new file mode 100644 index 000000000000..0a2de9c796df --- /dev/null +++ b/deps/checksums/highway_1_2_0_tar_gz/sha256 @@ -0,0 +1 @@ +7e0be78b8318e8bdbf6fa545d2ecb4c90f947df03f7aadc42c1967f019e63343 diff --git a/deps/test/highway/test_install.cpp b/deps/test/highway/test_install.cpp new file mode 100644 index 000000000000..89c12af5796c --- /dev/null +++ b/deps/test/highway/test_install.cpp @@ -0,0 +1,58 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2026 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#include <cstdio>
+
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace HWY_NAMESPACE {
+	namespace hn = hwy::HWY_NAMESPACE;
+
+	// Sums `num` doubles starting at `values`: whole vectors are accumulated lane-wise, reduced once, then leftover elements are added.
+	double SumLanes(const double* HWY_RESTRICT values, size_t num) {
+		const hn::ScalableTag<double> d;
+		auto sum = hn::Zero(d);
+		size_t i = 0;
+		for (; i + hn::Lanes(d) <= num; i += hn::Lanes(d)) {
+			sum = hn::Add(sum, hn::LoadU(d, values + i));
+		}
+		// Reduce the vector lanes to a single scalar using the library reduction (no fixed-size, fixed-alignment scratch buffer).
+		double result = hn::ReduceSum(d, sum);
+		// Scalar tail for remaining elements.
+		for (; i < num; ++i) {
+			result += values[i];
+		}
+		return result;
+	}
+} // namespace HWY_NAMESPACE
+HWY_AFTER_NAMESPACE();
+
+int main() {
+	double values[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+	const double sum = HWY_STATIC_DISPATCH(SumLanes)(values, sizeof(values) / sizeof(values[0]));
+	std::printf("Detected target: %s\n", hwy::TargetName(HWY_TARGET));
+	std::printf("Sum of [1..8] = %0.1f\n", sum);
+	// Fail the installation test on a wrong result; small integer-valued doubles sum exactly, so an exact comparison is safe.
+	if (sum != 36.0) {
+		std::fprintf(stderr, "Unexpected sum (expected 36.0).\n");
+		return 1;
+	}
+	std::printf("Highway installed successfully!\n");
+	return 0;
+}
diff --git a/docs/contributing/development.md b/docs/contributing/development.md
index 540f31088fc6..0b65be15cc61 100644
--- a/docs/contributing/development.md
+++ b/docs/contributing/development.md
@@ -78,6 +78,7 @@ The following external libraries can be automatically downloaded and compiled fr
 
 - [Boost][boost]: portable C++ libraries
 - [Cephes][cephes]: C/C++ special functions math library
+- [Highway][highway]: Google's portable SIMD library
 - [OpenBLAS][openblas]: optimized BLAS library
 - [Electron][electron]: framework for cross-platform desktop applications
 - [Emscripten][emscripten]: LLVM to JavaScript compiler
@@ -406,6 +407,8 @@ For contribution guidelines, see the [contributing guide][stdlib-contributing]. 
[cephes]: http://www.moshier.net/#Cephes +[highway]: https://github.com/google/highway + [openblas]: https://github.com/xianyi/OpenBLAS [electron]: https://www.electronjs.org/ diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile new file mode 100644 index 000000000000..22fea8529cb1 --- /dev/null +++ b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile @@ -0,0 +1,159 @@ +#/ +# @license Apache-2.0 +# +# Copyright (c) 2026 The Stdlib Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#/ + +# VARIABLES # + +ifndef VERBOSE + QUIET := @ +else + QUIET := +endif + +# Determine the OS ([1][1], [2][2]). +# +# [1]: https://en.wikipedia.org/wiki/Uname#Examples +# [2]: http://stackoverflow.com/a/27776822/2225624 +OS ?= $(shell uname) +ifneq (, $(findstring MINGW,$(OS))) + OS := WINNT +else +ifneq (, $(findstring MSYS,$(OS))) + OS := WINNT +else +ifneq (, $(findstring CYGWIN,$(OS))) + OS := WINNT +else +ifneq (, $(findstring Windows_NT,$(OS))) + OS := WINNT +endif +endif +endif +endif + +# Define the program used for compiling C++ source files: +ifdef CXX_COMPILER + CXX := $(CXX_COMPILER) +else + CXX := g++ +endif + +# Define the command-line options when compiling C++ files: +CXXFLAGS ?= \ + -std=c++17 \ + -O3 \ + -Wall \ + -pedantic + +# Determine whether to generate position independent code ([1][1], [2][2]). 
+# +# [1]: https://gcc.gnu.org/onlinedocs/gcc/Code-Gen-Options.html#Code-Gen-Options +# [2]: http://stackoverflow.com/questions/5311515/gcc-fpic-option +ifeq ($(OS), WINNT) + fPIC ?= +else + fPIC ?= -fPIC +endif + +# Define the package root: +pkg_dir := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../../..) + +# List of includes (e.g., `-I /foo/bar -I /beep/boop/include`): +INCLUDE ?= \ + -I$(pkg_dir)/include \ + -I$(pkg_dir)/../shared/include \ + -I$(pkg_dir)/../diagonal-types/include \ + -I$(pkg_dir)/../layouts/include \ + -I$(pkg_dir)/../matrix-orientations/include \ + -I$(pkg_dir)/../matrix-triangles/include \ + -I$(pkg_dir)/../operation-sides/include \ + -I$(pkg_dir)/../transpose-operations/include \ + -I$(pkg_dir)/../../strided/base/stride2offset/include \ + -I$(HIGHWAY) + +# List of source files: +SOURCE_FILES ?= $(pkg_dir)/src/daxpy_hwy.cpp + +# List of libraries (e.g., `-lopenblas -lpthread`): +LIBRARIES ?= + +# List of library paths (e.g., `-L /foo/bar -L /beep/boop`): +LIBPATH ?= + +# List of C++ targets: +cxx_targets := benchmark.out + + +# RULES # + +#/ +# Compiles source files. +# +# @param {string} [CXX_COMPILER] - C++ compiler (e.g., `g++`) +# @param {string} [CXXFLAGS] - C++ compiler options +# @param {(string|void)} [fPIC] - compiler flag determining whether to generate position independent code (e.g., `-fPIC`) +# @param {string} [INCLUDE] - list of includes (e.g., `-I /foo/bar -I /beep/boop/include`) +# @param {string} [SOURCE_FILES] - list of source files +# @param {string} [LIBPATH] - list of library paths (e.g., `-L /foo/bar -L /beep/boop`) +# @param {string} [LIBRARIES] - list of libraries (e.g., `-lopenblas -lpthread`) +# +# @example +# make +# +# @example +# make all +#/ +all: $(cxx_targets) + +.PHONY: all + +#/ +# Compiles C++ source files. 
+# +# @private +# @param {string} CXX - C++ compiler (e.g., `g++`) +# @param {string} CXXFLAGS - C++ compiler options +# @param {(string|void)} fPIC - compiler flag determining whether to generate position independent code (e.g., `-fPIC`) +# @param {string} INCLUDE - list of includes (e.g., `-I /foo/bar`) +# @param {string} SOURCE_FILES - list of source files +# @param {string} LIBPATH - list of library paths (e.g., `-L /foo/bar`) +# @param {string} LIBRARIES - list of libraries (e.g., `-lopenblas`) +#/ +$(cxx_targets): %.out: %.cpp + $(QUIET) $(CXX) $(CXXFLAGS) $(fPIC) $(INCLUDE) -o $@ $(SOURCE_FILES) $< $(LIBPATH) -lm $(LIBRARIES) + +#/ +# Runs compiled benchmarks. +# +# @example +# make run +#/ +run: $(cxx_targets) + $(QUIET) ./$< + +.PHONY: run + +#/ +# Removes generated files. +# +# @example +# make clean +#/ +clean: + $(QUIET) -rm -f *.o *.out + +.PHONY: clean diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp new file mode 100644 index 000000000000..c9d35cdd7ad9 --- /dev/null +++ b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp @@ -0,0 +1,161 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2026 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifndef NAME +#define NAME "daxpy" +#endif + +/** +* Highway-accelerated daxpy (forward declaration; defined in daxpy_hwy.cpp). +*/ +extern "C" void daxpy_hwy( int64_t N, double alpha, const double *X, int64_t strideX, double *Y, int64_t strideY ); + +#define ITERATIONS 10000000 +#define REPEATS 3 +#define MIN 1 +#define MAX 6 + +/** +* Prints the TAP version. +*/ +static void print_version( void ) { + printf( "TAP version 13\n" ); +} + +/** +* Prints the TAP summary. +* +* @param total total number of tests +* @param passing total number of passing tests +*/ +static void print_summary( int total, int passing ) { + printf( "#\n" ); + printf( "1..%d\n", total ); // TAP plan + printf( "# total %d\n", total ); + printf( "# pass %d\n", passing ); + printf( "#\n" ); + printf( "# ok\n" ); +} + +/** +* Prints benchmarks results. +* +* @param iterations number of iterations +* @param elapsed elapsed time in seconds +*/ +static void print_results( int iterations, double elapsed ) { + double rate = (double)iterations / elapsed; + printf( " ---\n" ); + printf( " iterations: %d\n", iterations ); + printf( " elapsed: %0.9f\n", elapsed ); + printf( " rate: %0.9f\n", rate ); + printf( " ...\n" ); +} + +/** +* Returns a clock time. +*/ +static double tic( void ) { + struct timeval now; + gettimeofday( &now, NULL ); + return (double)now.tv_sec + (double)now.tv_usec/1.0e6; +} + +/** +* Generates a random number on the interval [0,1). +* +* @return random number +*/ +static double rand_double( void ) { + int r = rand(); + return (double)r / ( (double)RAND_MAX + 1.0 ); +} + +/** +* Runs a benchmark. 
+* +* @param iterations number of iterations +* @param len array length +* @return elapsed time in seconds +*/ +static double benchmark_hwy( int iterations, int len ) { + double elapsed; + double *x; + double *y; + double t; + int i; + + x = (double *) malloc( len * sizeof( double ) ); + y = (double *) malloc( len * sizeof( double ) ); + for ( i = 0; i < len; i++ ) { + x[ i ] = ( rand_double()*200.0 ) - 100.0; + y[ i ] = ( rand_double()*200.0 ) - 100.0; + } + t = tic(); + for ( i = 0; i < iterations; i++ ) { + daxpy_hwy( len, 5.0, x, 1, y, 1 ); + if ( y[ 0 ] != y[ 0 ] ) { + printf( "should not return NaN\n" ); + break; + } + } + elapsed = tic() - t; + if ( y[ 0 ] != y[ 0 ] ) { + printf( "should not return NaN\n" ); + } + free( x ); + free( y ); + return elapsed; +} + +/** +* Main execution sequence. +*/ +int main( void ) { + double elapsed; + int count; + int iter; + int len; + int i; + int j; + + // Use the current time to seed the random number generator: + srand( time( NULL ) ); + + print_version(); + count = 0; + for ( i = MIN; i <= MAX; i++ ) { + len = pow( 10, i ); + iter = ITERATIONS / pow( 10, i-1 ); + for ( j = 0; j < REPEATS; j++ ) { + count += 1; + printf( "# cpp::highway::%s:len=%d\n", NAME, len ); + elapsed = benchmark_hwy( iter, len ); + print_results( iter, elapsed ); + printf( "ok %d benchmark finished\n", count ); + } + } + print_summary( count, count ); +} diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/binding.gyp b/lib/node_modules/@stdlib/blas/base/daxpy/binding.gyp index 864d9109e892..f1a76e8e91fe 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/binding.gyp +++ b/lib/node_modules/@stdlib/blas/base/daxpy/binding.gyp @@ -32,6 +32,9 @@ # Fortran compiler (to override -Dfortran_compiler=): 'fortran_compiler%': 'gfortran', + # Highway SIMD backend flag (to override -Dhighway=1): + 'highway%': 0, + # Fortran compiler flags: 'fflags': [ # Specify the Fortran standard to which a program is expected to conform: @@ -100,6 +103,9 @@ 
'<@(src_files)', ], + # Preprocessor definitions: + 'defines': [], + # Settings which should be applied when a target's object files are used as linker input: 'link_settings': { # Define libraries: @@ -131,7 +137,7 @@ # C++ specific compiler flags: 'cflags_cpp': [ # Specify the C++ standard to which a program is expected to conform: - '-std=c++11', + '-std=c++17', ], # Linker flags: @@ -160,6 +166,15 @@ ], }, ], # end condition (OS!="win") + [ + 'highway==1', + { + # Define preprocessor macro to dispatch to Highway SIMD kernel: + 'defines': [ + 'BLAS_HIGHWAY', + ], + }, + ], # end condition (highway==1) ], # end conditions # Define custom build actions for particular inputs: diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/include.gypi b/lib/node_modules/@stdlib/blas/base/daxpy/include.gypi index 3bfc9e282aab..102449f8d373 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/include.gypi +++ b/lib/node_modules/@stdlib/blas/base/daxpy/include.gypi @@ -36,6 +36,9 @@ # Path to BLAS library (to override -Dblas_dir=): 'blas_dir%': '', + + # Path to Highway SIMD library include directory (to override -Dhighway_include=): + 'highway_include%': '', }, # end variables # Source directory: @@ -44,6 +47,7 @@ # Include directories: 'include_dirs': [ '<@(blas_dir)', + '<@(highway_include)', ' +// When building with the Highway SIMD backend (BLAS=highway), dispatch to the +// Highway-accelerated kernel instead of the reference C implementation: +#ifdef BLAS_HIGHWAY +extern "C" void daxpy_hwy( CBLAS_INT N, double alpha, const double *X, CBLAS_INT strideX, double *Y, CBLAS_INT strideY ); +#define DAXPY_DISPATCH daxpy_hwy +#else +#define DAXPY_DISPATCH API_SUFFIX(c_daxpy) +#endif + /** * Receives JavaScript callback invocation data. 
* @@ -40,7 +49,7 @@ static napi_value addon( napi_env env, napi_callback_info info ) { STDLIB_NAPI_ARGV_INT64( env, strideY, argv, 5 ); STDLIB_NAPI_ARGV_STRIDED_FLOAT64ARRAY( env, X, N, strideX, argv, 2 ); STDLIB_NAPI_ARGV_STRIDED_FLOAT64ARRAY( env, Y, N, strideY, argv, 4 ); - API_SUFFIX(c_daxpy)( N, alpha, X, strideX, Y, strideY ); + DAXPY_DISPATCH( N, alpha, X, strideX, Y, strideY ); return NULL; } diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp new file mode 100644 index 000000000000..0e39b310dfab --- /dev/null +++ b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp @@ -0,0 +1,90 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2026 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include "stdlib/blas/base/daxpy.h" +#include "stdlib/blas/base/shared.h" + +HWY_BEFORE_NAMESPACE(); +namespace HWY_NAMESPACE { + namespace hn = hwy::HWY_NAMESPACE; + + void daxpy_kernel_simd( CBLAS_INT N, double alpha, const double *X, double *Y ) { + const hn::ScalableTag d; + const auto alpha_vec = hn::Set( d, alpha ); + + // Process in SIMD chunks + CBLAS_INT i = 0; + const CBLAS_INT lanes = static_cast( hn::Lanes( d ) ); + const CBLAS_INT max_i = N - ( N % lanes ); + + for ( ; i < max_i; i += lanes ) { + const auto x = hn::LoadU( d, X + i ); + const auto y = hn::LoadU( d, Y + i ); + const auto result = hn::MulAdd( alpha_vec, x, y ); + hn::StoreU( result, d, Y + i ); + } + + // Scalar tail for remaining elements + for ( ; i < N; ++i ) { + Y[i] += alpha * X[i]; + } + } + + void daxpy_strided_kernel( CBLAS_INT N, double alpha, const double *X, CBLAS_INT strideX, double *Y, CBLAS_INT strideY ) { + CBLAS_INT ix; + CBLAS_INT iy; + CBLAS_INT i; + + if ( alpha == 0.0 ) { + return; + } + ix = 0; + iy = 0; + for ( i = 0; i < N; i++ ) { + Y[ iy ] += alpha * X[ ix ]; + ix += strideX; + iy += strideY; + } + } +} // namespace HWY_NAMESPACE +HWY_AFTER_NAMESPACE(); + +// Highway-accelerated daxpy. Uses static dispatch to select the SIMD kernel +// compiled for the current target. Declared with C linkage so it can be called +// from both C and C++ translation units. +extern "C" void daxpy_hwy( CBLAS_INT N, double alpha, const double *X, CBLAS_INT strideX, double *Y, CBLAS_INT strideY ) { + if ( N <= 0 ) { + return; + } + + // BLAS convention: for negative strides, adjust pointers so that + // the first accessed element is at offset zero (stride2offset). 
+ if ( strideX < 0 ) { + X += ( 1 - N ) * strideX; + } + if ( strideY < 0 ) { + Y += ( 1 - N ) * strideY; + } + + if ( strideX == 1 && strideY == 1 ) { + HWY_STATIC_DISPATCH( daxpy_kernel_simd )( N, alpha, X, Y ); + } else { + HWY_STATIC_DISPATCH( daxpy_strided_kernel )( N, alpha, X, strideX, Y, strideY ); + } +} diff --git a/tools/make/common.mk b/tools/make/common.mk index f570982f637e..5d68a87fc743 100644 --- a/tools/make/common.mk +++ b/tools/make/common.mk @@ -699,3 +699,20 @@ deps_fftpack_version_slug := $(subst .,_,$(DEPS_FFTPACK_VERSION)) # Define the output path when building FFTPACK: DEPS_FFTPACK_BUILD_OUT ?= $(DEPS_BUILD_DIR)/pffft-$(DEPS_FFTPACK_VERSION) + +# Highway (Google SIMD library)... + +# Define the Highway version: +DEPS_HIGHWAY_VERSION ?= 1.2.0 + +# Generate a version slug: +deps_highway_version_slug := $(subst .,_,$(DEPS_HIGHWAY_VERSION)) + +# Define the output path when building Highway: +DEPS_HIGHWAY_BUILD_OUT ?= $(DEPS_BUILD_DIR)/highway_$(deps_highway_version_slug) + +# Define the include path for Highway: +DEPS_HIGHWAY_INCLUDE ?= $(DEPS_HIGHWAY_BUILD_OUT) + +# Define the path to Highway header source directory: +DEPS_HIGHWAY_SRC ?= $(DEPS_HIGHWAY_BUILD_OUT)/hwy diff --git a/tools/make/lib/benchmark/cpp.mk b/tools/make/lib/benchmark/cpp.mk index d3c41d4c503a..cd99bd307e35 100644 --- a/tools/make/lib/benchmark/cpp.mk +++ b/tools/make/lib/benchmark/cpp.mk @@ -43,7 +43,8 @@ benchmark-cpp: cd `dirname $$file` && \ $(MAKE) clean && \ CXX_COMPILER="$(CXX)" \ - BOOST="$(DEPS_BOOST_BUILD_OUT)" $(MAKE) && \ + BOOST="$(DEPS_BOOST_BUILD_OUT)" \ + HIGHWAY="$(DEPS_HIGHWAY_INCLUDE)" $(MAKE) && \ $(MAKE) run || exit 1; \ done @@ -71,7 +72,8 @@ benchmark-cpp-files: cd `dirname $$file` && \ $(MAKE) clean && \ CXX_COMPILER="$(CXX)" \ - BOOST="$(DEPS_BOOST_BUILD_OUT)" $(MAKE) && \ + BOOST="$(DEPS_BOOST_BUILD_OUT)" \ + HIGHWAY="$(DEPS_HIGHWAY_INCLUDE)" $(MAKE) && \ $(MAKE) run || exit 1; \ done diff --git a/tools/make/lib/install/Makefile 
b/tools/make/lib/install/Makefile index be8363f97b9a..49318667a533 100644 --- a/tools/make/lib/install/Makefile +++ b/tools/make/lib/install/Makefile @@ -40,6 +40,7 @@ include $(TOOLS_MAKE_LIB_DIR)/install/cppcheck.mk include $(TOOLS_MAKE_LIB_DIR)/install/electron.mk include $(TOOLS_MAKE_LIB_DIR)/install/emsdk.mk include $(TOOLS_MAKE_LIB_DIR)/install/fftpack.mk +include $(TOOLS_MAKE_LIB_DIR)/install/highway.mk include $(TOOLS_MAKE_LIB_DIR)/install/llvm.mk include $(TOOLS_MAKE_LIB_DIR)/install/node.mk include $(TOOLS_MAKE_LIB_DIR)/install/openblas.mk @@ -158,7 +159,7 @@ clean-deps-tests: clean-deps-openblas-tests # @example # make install-deps-dev #/ -install-deps-dev: install-deps-boost install-deps-cephes install-deps-cppcheck install-deps-fftpack install-deps-python install-deps-r install-deps-shellcheck +install-deps-dev: install-deps-boost install-deps-cephes install-deps-cppcheck install-deps-fftpack install-deps-highway install-deps-python install-deps-r install-deps-shellcheck .PHONY: install-deps-dev @@ -168,7 +169,7 @@ install-deps-dev: install-deps-boost install-deps-cephes install-deps-cppcheck i # @example # make clean-deps-dev #/ -clean-deps-dev: clean-deps-boost clean-deps-cephes clean-deps-cppcheck clean-deps-fftpack clean-deps-python clean-deps-r clean-deps-shellcheck +clean-deps-dev: clean-deps-boost clean-deps-cephes clean-deps-cppcheck clean-deps-fftpack clean-deps-highway clean-deps-python clean-deps-r clean-deps-shellcheck .PHONY: clean-deps-dev @@ -178,7 +179,7 @@ clean-deps-dev: clean-deps-boost clean-deps-cephes clean-deps-cppcheck clean-dep # @example # make clean-deps-dev-tests #/ -clean-deps-dev-tests: clean-deps-boost-tests clean-deps-cephes-tests clean-deps-cppcheck-tests clean-deps-fftpack-tests clean-deps-shellcheck-tests +clean-deps-dev-tests: clean-deps-boost-tests clean-deps-cephes-tests clean-deps-cppcheck-tests clean-deps-fftpack-tests clean-deps-highway-tests clean-deps-shellcheck-tests .PHONY: clean-deps-dev-tests diff --git 
a/tools/make/lib/install/README.md b/tools/make/lib/install/README.md index 87c8c37d14ef..e35cf413893b 100644 --- a/tools/make/lib/install/README.md +++ b/tools/make/lib/install/README.md @@ -37,6 +37,7 @@ This directory contains [`make`][make] rules for running the project's installat - [Cppcheck](#cppcheck) - [Electron](#electron) - [Emscripten SDK](#emscripten-sdk) + - [Highway](#highway) - [LLVM](#llvm) - [OpenBLAS](#openblas) - [Python](#python) @@ -421,6 +422,36 @@ $ make clean-deps-emscripten-tests * * * + + +### Highway + +#### install-deps-highway + +Installs [Highway][highway]. + +```bash +$ make install-deps-highway +``` + +#### clean-deps-highway + +Removes an installed [Highway][highway] distribution. + +```bash +$ make clean-deps-highway +``` + +#### clean-deps-highway-tests + +Removes compiled [Highway][highway] installation tests. + +```bash +$ make clean-deps-highway-tests +``` + +* * * + ### LLVM @@ -657,6 +688,8 @@ $ make clean-deps-wasi-libc-tests [emscripten-sdk]: https://github.com/emscripten-core/emsdk +[highway]: https://github.com/google/highway + [llvm]: https://llvm.org [node-js]: https://nodejs.org/en/ diff --git a/tools/make/lib/install/addons.mk b/tools/make/lib/install/addons.mk index 9aaee2253556..1405444f1e69 100644 --- a/tools/make/lib/install/addons.mk +++ b/tools/make/lib/install/addons.mk @@ -34,6 +34,9 @@ ifneq (, $(BLAS)) ifdef BLAS_DIR NODE_GYP_DEFINES += blas_dir=$(BLAS_DIR) endif +ifeq ($(BLAS), highway) + NODE_GYP_DEFINES += highway_include=$(DEPS_HIGHWAY_INCLUDE) highway=1 +endif endif endif diff --git a/tools/make/lib/install/highway.mk b/tools/make/lib/install/highway.mk new file mode 100644 index 000000000000..0fdf31ff9654 --- /dev/null +++ b/tools/make/lib/install/highway.mk @@ -0,0 +1,170 @@ +#/ +# @license Apache-2.0 +# +# Copyright (c) 2026 The Stdlib Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#/ + +# VARIABLES # + +# Define the download URL: +DEPS_HIGHWAY_URL ?= https://codeload.github.com/google/highway/tar.gz/$(DEPS_HIGHWAY_VERSION) + +# Determine the basename for the download: +deps_highway_basename := highway-$(DEPS_HIGHWAY_VERSION).tar.gz + +# Determine the directory name created by extracting the archive: +deps_highway_extract_out := $(DEPS_BUILD_DIR)/highway-$(DEPS_HIGHWAY_VERSION) + +# Define the path to the file containing a checksum to verify a download: +DEPS_HIGHWAY_CHECKSUM ?= $(shell $(CAT) $(DEPS_CHECKSUMS_DIR)/$(subst .,_,$(subst -,_,$(deps_highway_basename)))/sha256) + +# Define the output path when downloading: +DEPS_HIGHWAY_DOWNLOAD_OUT ?= $(DEPS_TMP_DIR)/$(deps_highway_basename) + +# Define the path to the directory containing tests: +DEPS_HIGHWAY_TEST_DIR ?= $(DEPS_DIR)/test/highway + +# Define the output directory path for compiled tests: +DEPS_HIGHWAY_TEST_OUT ?= $(DEPS_HIGHWAY_TEST_DIR)/build + +# Define the path to a test file for checking an installation: +DEPS_HIGHWAY_TEST_INSTALL ?= $(DEPS_HIGHWAY_TEST_DIR)/test_install.cpp + +# Define the output path for a test file: +DEPS_HIGHWAY_TEST_INSTALL_OUT ?= $(DEPS_HIGHWAY_TEST_OUT)/test_install + + +# RULES # + +#/ +# Downloads a Highway distribution. +# +# @private +#/ +$(DEPS_HIGHWAY_DOWNLOAD_OUT): | $(DEPS_TMP_DIR) + $(QUIET) echo 'Downloading Highway...' >&2 + $(QUIET) $(DEPS_DOWNLOAD_BIN) $(DEPS_HIGHWAY_URL) $(DEPS_HIGHWAY_DOWNLOAD_OUT) + +#/ +# Extracts a Highway gzipped tar archive. 
+# +# @private +#/ +$(DEPS_HIGHWAY_BUILD_OUT): $(DEPS_HIGHWAY_DOWNLOAD_OUT) | $(DEPS_BUILD_DIR) + $(QUIET) echo 'Extracting Highway...' >&2 + $(QUIET) $(TAR) -zxf $(DEPS_HIGHWAY_DOWNLOAD_OUT) -C $(DEPS_BUILD_DIR) + $(QUIET) mv $(deps_highway_extract_out) $(DEPS_HIGHWAY_BUILD_OUT) + +#/ +# Creates a directory for storing compiled tests. +# +# @private +#/ +$(DEPS_HIGHWAY_TEST_OUT): + $(QUIET) $(MKDIR_RECURSIVE) $(DEPS_HIGHWAY_TEST_OUT) + +#/ +# Compiles a test file for testing a Highway installation. +# +# @private +#/ +$(DEPS_HIGHWAY_TEST_INSTALL_OUT): $(DEPS_HIGHWAY_BUILD_OUT) $(DEPS_HIGHWAY_TEST_OUT) + $(QUIET) $(CXX) -std=c++17 -O3 -Wall -pedantic -I $(DEPS_HIGHWAY_BUILD_OUT) $(DEPS_HIGHWAY_TEST_INSTALL) -o $(DEPS_HIGHWAY_TEST_INSTALL_OUT) + +#/ +# Downloads a Highway distribution. +# +# @private +# +# @example +# make deps-download-highway +#/ +deps-download-highway: $(DEPS_HIGHWAY_DOWNLOAD_OUT) + +.PHONY: deps-download-highway + +#/ +# Verifies a downloaded Highway distribution. +# +# @private +# +# @example +# make deps-verify-highway +#/ +deps-verify-highway: deps-download-highway + $(QUIET) echo 'Verifying download...' >&2 + $(QUIET) $(DEPS_CHECKSUM_BIN) $(DEPS_HIGHWAY_DOWNLOAD_OUT) $(DEPS_HIGHWAY_CHECKSUM) >&2 + +.PHONY: deps-verify-highway + +#/ +# Extracts a downloaded Highway distribution. +# +# @private +# +# @example +# make deps-extract-highway +#/ +deps-extract-highway: $(DEPS_HIGHWAY_BUILD_OUT) + +.PHONY: deps-extract-highway + +#/ +# Tests an installed Highway distribution. +# +# @private +# +# @example +# make deps-test-highway +#/ +deps-test-highway: $(DEPS_HIGHWAY_TEST_INSTALL_OUT) + $(QUIET) echo 'Running tests...' >&2 + $(QUIET) $(DEPS_HIGHWAY_TEST_INSTALL_OUT) + $(QUIET) echo '' >&2 + $(QUIET) echo 'Success.' >&2 + +.PHONY: deps-test-highway + +#/ +# Installs Highway. 
+# +# @example +# make install-deps-highway +#/ +install-deps-highway: deps-download-highway deps-verify-highway deps-extract-highway deps-test-highway + +.PHONY: install-deps-highway + +#/ +# Removes an installed Highway distribution. +# +# @example +# make clean-deps-highway +#/ +clean-deps-highway: clean-deps-highway-tests + $(QUIET) $(DELETE) $(DELETE_FLAGS) $(DEPS_HIGHWAY_BUILD_OUT) + +.PHONY: clean-deps-highway + +#/ +# Removes compiled Highway installation tests. +# +# @example +# make clean-deps-highway-tests +#/ +clean-deps-highway-tests: + $(QUIET) $(DELETE) $(DELETE_FLAGS) $(DEPS_HIGHWAY_TEST_OUT) + +.PHONY: clean-deps-highway-tests From 70879a426d5cf3169b19ec20ff8e261d29a1872a Mon Sep 17 00:00:00 2001 From: Aayush Khanna Date: Wed, 24 Jun 2026 22:35:23 +0530 Subject: [PATCH 2/5] feat: add dynamic dispatch for simd kernels to auto pick target at runtime --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes. report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown_pkg_readmes status: na - task: lint_markdown_docs status: na - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: na - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: passed - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- .../base/daxpy/benchmark/cpp/highway/Makefile | 4 +- .../daxpy/benchmark/cpp/highway/benchmark.cpp | 9 +-- 
.../include/stdlib/blas/base/daxpy_hwy.h | 41 ++++++++++ .../stdlib/blas/base/daxpy_kernel_inl.h | 72 ++++++++++++++++++ .../@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp | 74 ++++--------------- 5 files changed, 135 insertions(+), 65 deletions(-) create mode 100644 lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h create mode 100644 lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile index 22fea8529cb1..63a5b6c64a06 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile +++ b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/Makefile @@ -86,7 +86,9 @@ INCLUDE ?= \ -I$(HIGHWAY) # List of source files: -SOURCE_FILES ?= $(pkg_dir)/src/daxpy_hwy.cpp +SOURCE_FILES ?= \ + $(pkg_dir)/src/daxpy_hwy.cpp \ + $(HIGHWAY)/hwy/targets.cc # List of libraries (e.g., `-lopenblas -lpthread`): LIBRARIES ?= diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp index c9d35cdd7ad9..a62c9b9a5da1 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp +++ b/lib/node_modules/@stdlib/blas/base/daxpy/benchmark/cpp/highway/benchmark.cpp @@ -23,15 +23,12 @@ #include #include +#include "stdlib/blas/base/daxpy_hwy.h" + #ifndef NAME #define NAME "daxpy" #endif -/** -* Highway-accelerated daxpy (forward declaration; defined in daxpy_hwy.cpp). 
-*/ -extern "C" void daxpy_hwy( int64_t N, double alpha, const double *X, int64_t strideX, double *Y, int64_t strideY ); - #define ITERATIONS 10000000 #define REPEATS 3 #define MIN 1 @@ -115,7 +112,7 @@ static double benchmark_hwy( int iterations, int len ) { } t = tic(); for ( i = 0; i < iterations; i++ ) { - daxpy_hwy( len, 5.0, x, 1, y, 1 ); + daxpy_simd( len, 5.0, x, 1, 0, y, 1, 0 ); if ( y[ 0 ] != y[ 0 ] ) { printf( "should not return NaN\n" ); break; diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h new file mode 100644 index 000000000000..13ac6a6a133b --- /dev/null +++ b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h @@ -0,0 +1,41 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2018 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +/** +* Header file containing function declarations for the C interface to the CBLAS Level 1 routine `cblas_daxpy`. +*/ +#ifndef STDLIB_BLAS_BASE_DAXPY_HWY_H +#define STDLIB_BLAS_BASE_DAXPY_HWY_H + +#include "stdlib/blas/base/shared.h" + +/* +* If C++, prevent name mangling so that the compiler emits a binary file having undecorated names, thus mirroring the behavior of a C compiler. 
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+* Multiplies a vector `X` by a constant `alpha` and adds the result to `Y`, i.e., performs `Y[offsetY + i*strideY] += alpha * X[offsetX + i*strideX]` for `i = 0..N-1`.
+*
+* The function returns without touching `Y` when `N <= 0`. When both strides equal `1`, a SIMD kernel is used; otherwise, a scalar strided kernel is used.
+*
+* @param N        number of indexed elements
+* @param alpha    scalar constant
+* @param X        input array
+* @param strideX  index increment for `X`
+* @param offsetX  starting index for `X`
+* @param Y        output array (updated in place)
+* @param strideY  index increment for `Y`
+* @param offsetY  starting index for `Y`
+*/
+void API_SUFFIX(daxpy_simd)( CBLAS_INT N, double alpha, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !STDLIB_BLAS_BASE_DAXPY_HWY_H
+
diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h
new file mode 100644
index 000000000000..73ae04196b3c
--- /dev/null
+++ b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h
@@ -0,0 +1,72 @@
+/**
+* @license Apache-2.0
+*
+* Copyright (c) 2026 The Stdlib Authors.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <hwy/highway.h>
+#include "stdlib/blas/base/shared.h"
+
+// NOTE: the kernels below are defined inside the per-target `HWY_NAMESPACE`, so every inclusion of this file under a different Highway target produces a separate copy of them.
+HWY_BEFORE_NAMESPACE();
+namespace HWY_NAMESPACE {
+	namespace hn = hwy::HWY_NAMESPACE;
+
+	/**
+	* Computes `Y[offsetY+i] += alpha * X[offsetX+i]` for `i = 0..N-1` over unit-stride data.
+	*
+	* Whole vectors are processed using unaligned loads/stores and a vector multiply-add; any leftover elements are handled by a scalar loop.
+	*
+	* @param N        number of elements to process
+	* @param alpha    scalar constant
+	* @param X        input array
+	* @param offsetX  index of the first element of `X`
+	* @param Y        output array (updated in place)
+	* @param offsetY  index of the first element of `Y`
+	*/
+	void daxpy_kernel_simd( CBLAS_INT N, double alpha, const double *X, const CBLAS_INT offsetX, double *Y, const CBLAS_INT offsetY ) {
+		// Match the strided kernel: an empty range is a no-op, and if `alpha` is `0`, then `y` is unchanged (without this, a NaN or infinity in `x` would turn the corresponding `y` into NaN)...
+		if ( N <= 0 || alpha == 0.0 ) {
+			return;
+		}
+		X += offsetX;
+		Y += offsetY;
+
+		const hn::ScalableTag<double> d;
+		const auto alpha_vec = hn::Set( d, alpha );
+
+		// Largest multiple of the vector width which does not exceed `N`:
+		const CBLAS_INT lanes = static_cast<CBLAS_INT>( hn::Lanes( d ) );
+		const CBLAS_INT max_i = N - ( N % lanes );
+
+		CBLAS_INT i = 0;
+		for ( ; i < max_i; i += lanes ) {
+			const auto x = hn::LoadU( d, X + i );
+			const auto y = hn::LoadU( d, Y + i );
+			const auto result = hn::MulAdd( alpha_vec, x, y );
+			hn::StoreU( result, d, Y + i );
+		}
+
+		// Scalar tail for the elements which do not fill a whole vector:
+		for ( ; i < N; ++i ) {
+			Y[i] += alpha * X[i];
+		}
+	}
+
+	/**
+	* Computes `Y[offsetY + i*strideY] += alpha * X[offsetX + i*strideX]` for `i = 0..N-1` using a scalar loop.
+	*
+	* @param N        number of indexed elements
+	* @param alpha    scalar constant
+	* @param X        input array
+	* @param strideX  index increment for `X`
+	* @param offsetX  starting index for `X`
+	* @param Y        output array (updated in place)
+	* @param strideY  index increment for `Y`
+	* @param offsetY  starting index for `Y`
+	*/
+	void daxpy_strided_kernel( CBLAS_INT N, double alpha, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) {
+		CBLAS_INT ix;
+		CBLAS_INT iy;
+		CBLAS_INT i;
+
+		if ( N <= 0 ) {
+			return;
+		}
+		// If `alpha` is `0`, then `y` is unchanged...
+		if ( alpha == 0.0 ) {
+			return;
+		}
+		ix = offsetX;
+		iy = offsetY;
+		for ( i = 0; i < N; i++ ) {
+			Y[ iy ] += alpha * X[ ix ];
+			ix += strideX;
+			iy += strideY;
+		}
+	}
+} // namespace HWY_NAMESPACE
+HWY_AFTER_NAMESPACE();
diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp
index 0e39b310dfab..e8ff7c026edb 100644
--- a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp
+++ b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp
@@ -16,75 +16,33 @@
 * limitations under the License.
*/ -#include -#include "stdlib/blas/base/daxpy.h" -#include "stdlib/blas/base/shared.h" - -HWY_BEFORE_NAMESPACE(); -namespace HWY_NAMESPACE { - namespace hn = hwy::HWY_NAMESPACE; +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "stdlib/blas/base/daxpy_kernel_inl.h" - void daxpy_kernel_simd( CBLAS_INT N, double alpha, const double *X, double *Y ) { - const hn::ScalableTag d; - const auto alpha_vec = hn::Set( d, alpha ); - - // Process in SIMD chunks - CBLAS_INT i = 0; - const CBLAS_INT lanes = static_cast( hn::Lanes( d ) ); - const CBLAS_INT max_i = N - ( N % lanes ); +#include +#include - for ( ; i < max_i; i += lanes ) { - const auto x = hn::LoadU( d, X + i ); - const auto y = hn::LoadU( d, Y + i ); - const auto result = hn::MulAdd( alpha_vec, x, y ); - hn::StoreU( result, d, Y + i ); - } +// foreach_target.h compiles the kernel for all NON-static targets. +// Now compile it for the static target that was skipped. +#include HWY_TARGET_INCLUDE - // Scalar tail for remaining elements - for ( ; i < N; ++i ) { - Y[i] += alpha * X[i]; - } - } +#include "stdlib/blas/base/shared.h" - void daxpy_strided_kernel( CBLAS_INT N, double alpha, const double *X, CBLAS_INT strideX, double *Y, CBLAS_INT strideY ) { - CBLAS_INT ix; - CBLAS_INT iy; - CBLAS_INT i; +#if HWY_ONCE - if ( alpha == 0.0 ) { - return; - } - ix = 0; - iy = 0; - for ( i = 0; i < N; i++ ) { - Y[ iy ] += alpha * X[ ix ]; - ix += strideX; - iy += strideY; - } - } -} // namespace HWY_NAMESPACE -HWY_AFTER_NAMESPACE(); +HWY_EXPORT( daxpy_kernel_simd ); +HWY_EXPORT( daxpy_strided_kernel ); -// Highway-accelerated daxpy. Uses static dispatch to select the SIMD kernel -// compiled for the current target. Declared with C linkage so it can be called -// from both C and C++ translation units. 
-extern "C" void daxpy_hwy( CBLAS_INT N, double alpha, const double *X, CBLAS_INT strideX, double *Y, CBLAS_INT strideY ) { +extern "C" void daxpy_simd( CBLAS_INT N, double alpha, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) { if ( N <= 0 ) { return; } - // BLAS convention: for negative strides, adjust pointers so that - // the first accessed element is at offset zero (stride2offset). - if ( strideX < 0 ) { - X += ( 1 - N ) * strideX; - } - if ( strideY < 0 ) { - Y += ( 1 - N ) * strideY; - } - if ( strideX == 1 && strideY == 1 ) { - HWY_STATIC_DISPATCH( daxpy_kernel_simd )( N, alpha, X, Y ); + HWY_DYNAMIC_DISPATCH( daxpy_kernel_simd )( N, alpha, X, offsetX, Y, offsetY ); } else { - HWY_STATIC_DISPATCH( daxpy_strided_kernel )( N, alpha, X, strideX, Y, strideY ); + HWY_DYNAMIC_DISPATCH( daxpy_strided_kernel )( N, alpha, X, strideX, offsetX, Y, strideY, offsetY ); } } + +#endif From d21f7beb32828a7de8c108679c0497334aa2bed2 Mon Sep 17 00:00:00 2001 From: Aayush Khanna Date: Wed, 24 Jun 2026 22:40:45 +0530 Subject: [PATCH 3/5] docs: add docs --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes. 
report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown_pkg_readmes status: na - task: lint_markdown_docs status: na - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: na - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: passed - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- .../daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h | 2 +- .../@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h index 73ae04196b3c..31ee899fdfa4 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h +++ b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_kernel_inl.h @@ -19,7 +19,7 @@ #include <hwy/highway.h> #include "stdlib/blas/base/shared.h" - +// define the kernels inline HWY_BEFORE_NAMESPACE(); namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp index e8ff7c026edb..4a8046fe0974 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp +++ b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp @@ -19,7 +19,9 @@ #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE
"stdlib/blas/base/daxpy_kernel_inl.h" -#include <hwy/foreach_target.h> +// note that the order of includes matters here... + +#include <hwy/foreach_target.h> // includes HWY_TARGET_INCLUDE once for each target #include <hwy/highway.h> // foreach_target.h compiles the kernel for all NON-static targets. @@ -28,19 +30,24 @@ #include "stdlib/blas/base/shared.h" +// compile the following only once per translation unit (not once per target) #if HWY_ONCE +// export the kernels for dynamic dispatch HWY_EXPORT( daxpy_kernel_simd ); HWY_EXPORT( daxpy_strided_kernel ); +// declare with C linkage without any namespaces extern "C" void daxpy_simd( CBLAS_INT N, double alpha, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) { if ( N <= 0 ) { return; } if ( strideX == 1 && strideY == 1 ) { + // if contiguous memory then use optimised simd kernel HWY_DYNAMIC_DISPATCH( daxpy_kernel_simd )( N, alpha, X, offsetX, Y, offsetY ); } else { + // TODO: see scatter and gather instructions and determine if this can be optimised else call daxpy_ndarray HWY_DYNAMIC_DISPATCH( daxpy_strided_kernel )( N, alpha, X, strideX, offsetX, Y, strideY, offsetY ); } } From f95e3a232cd092f4b76131562b3763d96acf96d9 Mon Sep 17 00:00:00 2001 From: Aayush Khanna Date: Wed, 24 Jun 2026 22:45:21 +0530 Subject: [PATCH 4/5] feat: include order --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes.
report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown_pkg_readmes status: na - task: lint_markdown_docs status: na - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: na - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: passed - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp index 4a8046fe0974..d65ebfc2237d 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp +++ b/lib/node_modules/@stdlib/blas/base/daxpy/src/daxpy_hwy.cpp @@ -23,13 +23,12 @@ #include <hwy/foreach_target.h> // includes HWY_TARGET_INCLUDE once for each target #include <hwy/highway.h> +#include "stdlib/blas/base/shared.h" // foreach_target.h compiles the kernel for all NON-static targets. // Now compile it for the static target that was skipped. #include HWY_TARGET_INCLUDE - -#include "stdlib/blas/base/shared.h" - // compile the following only once per translation unit (not once per target) #if HWY_ONCE From e0730790d688ee0bc139adbf04e54242225ffc8f Mon Sep 17 00:00:00 2001 From: Aayush Khanna Date: Wed, 24 Jun 2026 22:47:19 +0530 Subject: [PATCH 5/5] docs: fix comment --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes.
report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown_pkg_readmes status: na - task: lint_markdown_docs status: na - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: na - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: passed - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- .../blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h index 13ac6a6a133b..e25b5eba8a13 100644 --- a/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h +++ b/lib/node_modules/@stdlib/blas/base/daxpy/include/stdlib/blas/base/daxpy_hwy.h @@ -1,7 +1,7 @@ /** * @license Apache-2.0 * -* Copyright (c) 2018 The Stdlib Authors. +* Copyright (c) 2026 The Stdlib Authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ */ /** -* Header file containing function declarations for the C interface to the CBLAS Level 1 routine `cblas_daxpy`. +* Header file containing the function declaration for `daxpy_simd`, a SIMD implementation of the BLAS Level 1 routine `daxpy`. */ #ifndef STDLIB_BLAS_BASE_DAXPY_HWY_H #define STDLIB_BLAS_BASE_DAXPY_HWY_H