From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- library/stdarch/examples/Cargo.toml | 30 + library/stdarch/examples/connect5.rs | 1272 ++++++++++++++++++++++++++++++++++ library/stdarch/examples/hex.rs | 401 +++++++++++ library/stdarch/examples/wasm.rs | 45 ++ 4 files changed, 1748 insertions(+) create mode 100644 library/stdarch/examples/Cargo.toml create mode 100644 library/stdarch/examples/connect5.rs create mode 100644 library/stdarch/examples/hex.rs create mode 100644 library/stdarch/examples/wasm.rs (limited to 'library/stdarch/examples') diff --git a/library/stdarch/examples/Cargo.toml b/library/stdarch/examples/Cargo.toml new file mode 100644 index 000000000..e2590ed9f --- /dev/null +++ b/library/stdarch/examples/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "stdarch_examples" +version = "0.0.0" +authors = [ + "Alex Crichton ", + "Andrew Gallant ", + "Gonzalo Brito Gadeschi ", +] +description = "Examples of the stdarch crate." +edition = "2018" +default-run = "hex" + +[dependencies] +core_arch = { path = "../crates/core_arch" } +std_detect = { path = "../crates/std_detect" } +quickcheck = "0.9" +rand = "0.7" + +[[bin]] +name = "hex" +path = "hex.rs" + +[[bin]] +name = "connect5" +path = "connect5.rs" + +[[example]] +name = "wasm" +crate-type = ["cdylib"] +path = "wasm.rs" diff --git a/library/stdarch/examples/connect5.rs b/library/stdarch/examples/connect5.rs new file mode 100644 index 000000000..1b3325785 --- /dev/null +++ b/library/stdarch/examples/connect5.rs @@ -0,0 +1,1272 @@ +//! Outer-Open Gomoku is a board game which is a enhanced version of connect5 (Gomoku).\ +//! The game is a two-player game which played on a 15x15 Go board.\ +//! Two players take turns placing a move on an empty intersection in this board.\ +//! The winner is the first player to form an unbroken chain of five moves horizontally, vertically, or diagonally.\ +//! Unlike Gomoku, the first move is required to be placed at the two outer rows or columns of this board.\ +//! This program provides an AI playing with Minimax search with alpha-beta pruning which uses +//! patterns on evaluation.\ +//! The avx512 intrinsic can do 32 pattern matching at one time.\ +//! This avx512 is tested with non-avx512 code to verify its correctness.\ +//! +//! On Intel i7-7800x using single thread with fixed AVX-512 clock at 4.0GHz, the avx512 is speed up about 9x.\ +//! The average time for each move in the avx512 is around 14.00s ± 1.31s and in the non-avx512 +//! is 129.02s ± 4.96s.\ +//! On Intel Tiger Lake i7-1165G7, the avx512 is around 11.11s ± 1.31s. +//! +//! Pattern Matching\ +//! Use 512-bit to present the board state. The location 0 is top left.\ +//! 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\ +//! 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31\ +//! ...\ +//! Pattern "OOOOO" is matching through "0 1 2 3 4", "1 2 3 4 5", ...\ +//! Using avx512, "0 1 2 3 4", "16 17 18 19 20", ... can be matched simultaneously.\ +//! +//! //! You can test out this program via: +//! +//! cargo +nightly run --release --bin connect5 +//! +//! You should see a game self-playing. In the end of the game, it shows the average time for +//! each move. + +#![feature(stdsimd, avx512_target_feature)] +#![feature(stmt_expr_attributes)] + +use rand::seq::SliceRandom; +use rand::thread_rng; + +use std::cmp; +use std::time::Instant; + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; + +// types + +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum Color { + Black = 0, + White = 1, + Empty = 2, + Border = 3, +} + +type Square = i32; +type Move = i32; +type Side = Color; + +// constants + +const FILE_SIZE: i32 = 15; +const RANK_SIZE: i32 = 15; +const SQUARE_SIZE: i32 = (FILE_SIZE + 1) * (FILE_SIZE + 4) + 16 + 4; + +const EVAL_INF: i32 = FILE_SIZE * RANK_SIZE * 100; +const MOVE_NONE: Move = -1; +const SCORE_NONE: i32 = -EVAL_INF - 1; + +/// DIRECTION 0: left to right\ +/// DIRECTION 1: top to bottom\ +/// DIRECTION 2: top left to bottom right\ +/// DIRECTION 3: top right to bottom left +#[rustfmt::skip] +const DIRECTION: [[i32; 5]; 4] = [ [1, 2, 3, 4, 5], + [1 * (FILE_SIZE + 1), 2 * (FILE_SIZE + 1), 3 * (FILE_SIZE + 1), 4 * (FILE_SIZE + 1), 5 * (FILE_SIZE + 1)], + [1 * (FILE_SIZE + 2), 2 * (FILE_SIZE + 2), 3 * (FILE_SIZE + 2), 4 * (FILE_SIZE + 2), 5 * (FILE_SIZE + 2)], + [1 * (FILE_SIZE + 0), 2 * (FILE_SIZE + 0), 3 * (FILE_SIZE + 0), 4 * (FILE_SIZE + 0), 5 * (FILE_SIZE + 0)]]; + +/// A table to encode each location to a value in bit 31-0 in the bitboard for 4 direction +#[rustfmt::skip] +const MAPMOVEVALUE: [[i32; 239]; 4] = [ [// Direction 0 + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17, 0, + 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], + [// Direction 1 + 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, + 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, + 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, + 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, + 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 0, + 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 0, + 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 0, + 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 0, + 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 0, + 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 0, + 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 0, + 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 0, + 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 0, + 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 0, + 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], + [// Direction 2 + 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0, 0, 0, 0, 0, + 1<<15, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 0, 0, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 0, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 0, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 0, + 1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 0, + 1<<9, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 0, + 1<<8, 1<<8, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 0, + 1<<7, 1<<7, 1<<7, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 0, + 1<<6, 1<<6, 1<<6, 1<<6, 1<<11, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 0, + 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<10, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0, + 0, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 1<<9, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<4, 1<<4, 1<<4, 0, + 0, 0, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 1<<8, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<3, 1<<3, 0, + 0, 0, 0, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<2, 0, + 0, 0, 0, 0, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<1], + [// Direction 3 + 0, 0, 0, 0, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 1<<15, 0, + 0, 0, 0, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<14, 1<<15, 0, + 0, 0, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<13, 1<<14, 1<<15, 0, + 0, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 0, + 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<9, 0, + 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<8, 1<<8, 0, + 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<7, 1<<7, 1<<7, 0, + 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<6, 1<<6, 1<<6, 1<<6, 0, + 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<5, 1<<5, 1<<5, 1<<5, 1<<5, 0, + 1<<4, 1<<4, 1<<4, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<4, 1<<4, 1<<4, 1<<4, 1<<4, 0, 0, + 1<<3, 1<<3, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<3, 1<<3, 1<<3, 1<<3, 1<<3, 0, 0, 0, + 1<<2, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<2, 1<<2, 1<<2, 1<<2, 1<<2, 0, 0, 0, 0, + 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<1, 1<<1, 1<<1, 1<<1, 1<<1, 0, 0, 0, 0] + ]; + +/// A table to encode each location to an index in the bitboard for 4 direction +#[rustfmt::skip] +const MAPMOVEIDX: [[i32; 239]; 4] = [ [// Direction 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], + [// Direction 1 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + [// Direction 2 + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, + 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0, + 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 0, + 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 0, + 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 0, + 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 0, + 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 0, + 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 8, 0, + 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10, 9, 0, + 0, 0, 0, 0, 5, 4, 3, 2, 1, 15, 14, 13, 12, 11, 10], + [// Direction 3 + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, + 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 0, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 0, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 0, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 0, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, + 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, + 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0, + 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 0, 0, 0, 0] + ]; + +// structures + +/// Use one-dimensional array to store the board state. The location 0 is top left.\ +/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\ +/// 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31\ +/// ... \ +/// position 15, 31, ... are Borders.\ +/// position 0 is file 0, rank 0.\ +/// position 17 is file 1, rank 1.\ +/// +/// Use a three-dimensional array to store the bitboard.\ +/// The first dimension is color: Black, White and Empty.\ +/// The second and third one are 2 x 512-bit. Direction 0 and 2 use the first 512-bit. Direction 1 and +/// 3 use the second 512-bit.\ +/// Each 512-bit is a 32-bit x 16 array. Direction 0 and 1 store at bit 31-16 and Direction 2 and 3 store at bit 15-0. + +pub struct Pos { + // position + state: [Color; SQUARE_SIZE as usize], + p_turn: Side, + bitboard: [[[i32; 16]; 2]; 3], +} + +impl Pos { + pub fn init(&mut self) { + // starting position + // Set up the Border + for i in 0..SQUARE_SIZE as usize { + self.state[i] = Color::Border; + } + + // In the beginning, all is Empty + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + self.state[sq as usize] = Color::Empty; + } + } + + // first move is Black + self.p_turn = Color::Black; + + let black = Color::Black as usize; + let white = Color::White as usize; + let empty = Color::Empty as usize; + + // set up the corresponding bitboard + for i in 0..2 { + for j in 0..16 { + self.bitboard[black][i][j] = 0; + self.bitboard[white][i][j] = 0; + self.bitboard[empty][i][j] = 0; + } + } + + for i in 0..2 { + // use bit 31-16 to store direction 0 and 1 + #[rustfmt::skip] + for j in 0..FILE_SIZE as usize { + self.bitboard[empty][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + } + } + + // use bit 15-0 to store direction 2 and 3. There are 21 for each one. We combine row1 and row16, row2 and row17, row3 and row18, row4 and row19, and row 5 and row20 + #[rustfmt::skip] + for i in 0..2 { + self.bitboard[empty][i][0] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11); //row 0 + self.bitboard[empty][i][1] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)/*row1*/|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row16 + self.bitboard[empty][i][2] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)/*row2*/|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row17 + self.bitboard[empty][i][3] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)/*row3*/|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row18 + self.bitboard[empty][i][4] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)/*row4*/|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row19 + self.bitboard[empty][i][5] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)/*row5*/|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row20 + self.bitboard[empty][i][6] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row6 + self.bitboard[empty][i][7] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row7 + self.bitboard[empty][i][8] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row8 + self.bitboard[empty][i][9] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row9 + self.bitboard[empty][i][10] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2)|(1<<1);//row10 + self.bitboard[empty][i][11] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3)|(1<<2);//row11 + self.bitboard[empty][i][12] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4)|(1<<3);//row12 + self.bitboard[empty][i][13] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5)|(1<<4);//row13 + self.bitboard[empty][i][14] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6)|(1<<5);//row14 + self.bitboard[empty][i][15] |= (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9)|(1<<8)|(1<<7)|(1<<6);//row15 + } + } + + pub fn do_move(&mut self, mv: Move) { + let atk: Side = self.p_turn; + let def: Side = side_opp(atk); + + let mv = mv as usize; + let black = Color::Black as usize; + let white = Color::White as usize; + let empty = Color::Empty as usize; + + match self.p_turn { + Color::Black => { + self.state[mv as usize] = Color::Black; + // update black move and remove empty move in bitboard + self.bitboard[black][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv]; + self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv]; + self.bitboard[black][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv]; + self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv]; + self.bitboard[black][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv]; + self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv]; + self.bitboard[black][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv]; + self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv]; + } + Color::White => { + self.state[mv as usize] = Color::White; + // update white move and remove empty move in bitboard + self.bitboard[white][0][MAPMOVEIDX[0][mv] as usize] |= MAPMOVEVALUE[0][mv]; + self.bitboard[empty][0][MAPMOVEIDX[0][mv] as usize] ^= MAPMOVEVALUE[0][mv]; + self.bitboard[white][1][MAPMOVEIDX[1][mv] as usize] |= MAPMOVEVALUE[1][mv]; + self.bitboard[empty][1][MAPMOVEIDX[1][mv] as usize] ^= MAPMOVEVALUE[1][mv]; + self.bitboard[white][0][MAPMOVEIDX[2][mv] as usize] |= MAPMOVEVALUE[2][mv]; + self.bitboard[empty][0][MAPMOVEIDX[2][mv] as usize] ^= MAPMOVEVALUE[2][mv]; + self.bitboard[white][1][MAPMOVEIDX[3][mv] as usize] |= MAPMOVEVALUE[3][mv]; + self.bitboard[empty][1][MAPMOVEIDX[3][mv] as usize] ^= MAPMOVEVALUE[3][mv]; + } + _ => panic! {}, + } + + self.p_turn = def; + } + + fn turn(&self) -> Side { + self.p_turn + } + + pub fn can_play(&self, from: Square) -> bool { + if self.state[from as usize] == Color::Empty { + true + } else { + false + } + } +} + +pub struct List { + // legal move list + p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], + p_size: i32, +} + +/// Use List to store legal moves. +impl List { + pub fn clear(&mut self) { + self.p_size = 0; + } + + pub fn add(&mut self, mv: Move) { + self.p_move[self.p_size as usize] = mv; + self.p_size += 1; + } + + pub fn size(&self) -> i32 { + self.p_size + } + + pub fn shuffle(&mut self) { + let mut rng = thread_rng(); + let num = self.p_size; + let mut new_move: Vec = vec![]; + + for x in 0..(num as usize) { + new_move.push(self.p_move[x]); + } + + new_move.shuffle(&mut rng); + + for x in 0..(self.p_size as usize) { + self.p_move[x] = new_move[x]; + } + } +} + +// functions + +fn square_make(fl: i32, rk: i32) -> Square { + rk * (FILE_SIZE + 1) + fl +} + +fn side_opp(sd: Side) -> Side { + match sd { + Side::White => Side::Black, + Side::Black => Side::White, + _ => panic!(""), + } +} + +fn pos_is_winner(pos: &Pos) -> bool { + let current_side = side_opp(pos.p_turn); + check_pattern5(&pos, current_side) +} + +fn pos_is_draw(pos: &Pos) -> bool { + let mut found: bool = true; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + found = false; + break; + } + + if found == false { + break; + } + } + } + + let mut out: bool = false; + if found == true && !pos_is_winner(pos) { + out = true; + } + + out +} + +#[target_feature(enable = "avx512f,avx512bw")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn pos_is_draw_avx512(pos: &Pos) -> bool { + let empty = Color::Empty as usize; + + let board0org = _mm512_loadu_epi32(&pos.bitboard[empty][0][0]); + + let answer = _mm512_set1_epi32(0); + + // if all empty is 0, all board is filled. + let temp_mask = _mm512_mask_cmpneq_epi32_mask(0b11111111_11111111, answer, board0org); + + if _popcnt32(temp_mask as i32) == 0 && !pos_is_winner_avx512(pos) { + return true; + } else { + return false; + } +} + +fn pos_is_end(pos: &Pos) -> bool { + if pos_is_winner(pos) || pos_is_draw(pos) { + true + } else { + false + } +} + +fn pos_disp(pos: &Pos) { + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + match pos.state[sq as usize] { + Color::Black => print!("# "), + Color::White => print!("O "), + Color::Empty => print!("- "), + Color::Border => print!("| "), + } + } + + println!(""); + } + + match pos.turn() { + Color::Black => println!("black to play"), + Color::White => println!("white to play"), + _ => panic!(), + } +} + +fn gen_moves(list: &mut List, pos: &Pos) { + list.clear(); + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + list.add(sq); + } + } + } +} + +/// AI: use Minimax search with alpha-beta pruning +fn search(pos: &Pos, alpha: i32, beta: i32, depth: i32, _ply: i32) -> i32 { + assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); + // leaf? + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + unsafe { + if pos_is_winner_avx512(&pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw_avx512(&pos) { + return 0; + } + } + } else { + if pos_is_winner(&pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw(&pos) { + return 0; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if pos_is_winner(&pos) { + return -EVAL_INF + _ply; + } + + if pos_is_draw(&pos) { + return 0; + } + } + + if depth == 0 { + return eval(&pos, _ply); + } + + let p_move_new: [Move; (FILE_SIZE * RANK_SIZE) as usize] = + [0; (FILE_SIZE * RANK_SIZE) as usize]; + + let mut list = List { + p_move: p_move_new, + p_size: 0, + }; + + let mut bm: Move = MOVE_NONE; + let mut bs: i32 = SCORE_NONE; + + gen_moves(&mut list, &pos); + + // move loop + + if _ply == 0 { + list.shuffle(); + } + + for i in 0..list.size() { + if bs < beta { + let mv: Move = list.p_move[i as usize]; + + let mut new_pos = Pos { + state: pos.state, + p_turn: pos.p_turn, + bitboard: pos.bitboard, + }; + + new_pos.do_move(mv); + + let sc: i32 = -search(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, _ply + 1); + + if sc > bs { + bm = mv; + bs = sc; + } + } + } + + assert!(bm != MOVE_NONE); + assert!(bs >= -EVAL_INF && bs <= EVAL_INF); + + if _ply == 0 { + bm + } else { + bs + } //best move at the root node, best score elsewhere +} + +/// Evaluation function: give different scores to different patterns after a fixed depth. +fn eval(pos: &Pos, _ply: i32) -> i32 { + let atk: Side = pos.turn(); + let def: Side = side_opp(atk); + + // check if opp has live4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + unsafe { + if check_patternlive4_avx512(&pos, def) { + return -4096; + } + } + } else { + if check_patternlive4(&pos, def) { + return -4096; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patternlive4(&pos, def) { + return -4096; + } + } + + // check if self has live4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + unsafe { + if check_patternlive4_avx512(&pos, atk) { + return 2560; + } + } + } else { + if check_patternlive4(&pos, atk) { + return 2560; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patternlive4(&pos, atk) { + return 2560; + } + } + + // check if self has dead4 which will win playing next move + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + unsafe { + if check_patterndead4_avx512(&pos, atk) > 0 { + return 2560; + } + } + } else { + if check_patterndead4(&pos, atk) > 0 { + return 2560; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + if check_patterndead4(&pos, atk) > 0 { + return 2560; + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + unsafe { + let n_c4: i32 = check_patterndead4_avx512(&pos, def); + let n_c3: i32 = check_patternlive3_avx512(&pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + if check_patternlive3_avx512(&pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if n_c3 > 1 { + return -2048; + } + } + } else { + let n_c4: i32 = check_patterndead4(&pos, def); + let n_c3: i32 = check_patternlive3(&pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + // check if self has 2 live3 which will win playing the next two move + if check_patternlive3(&pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if n_c3 > 1 { + return -2048; + } + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + let n_c4: i32 = check_patterndead4(&pos, def); + let n_c3: i32 = check_patternlive3(&pos, def); + + // check if opp has 2 dead4 which will win playing next move + if n_c4 > 1 { + return -2048; + } + + // check if opp has a dead 4 and live 3 which will win playing the next two move + if n_c4 == 1 && n_c3 > 0 { + return -2048; + } + + // check if self has 2 live3 which will win playing the next two move + if check_patternlive3(&pos, atk) > 1 { + return 2560; + } + + // check if opp has 2 live3 which will win playing the next two move + if n_c3 > 1 { + return -2048; + } + } + + 0 +} + +/// Check OOOOO +fn check_pattern5(pos: &Pos, sd: Side) -> bool { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for pat in 0..4 { + let idx0 = sq; + let idx1 = sq + DIRECTION[pat][0]; + let idx2 = sq + DIRECTION[pat][1]; + let idx3 = sq + DIRECTION[pat][2]; + let idx4 = sq + DIRECTION[pat][3]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + } + + if n > 0 { + true + } else { + false + } +} + +/// Check -OOOO- +fn check_patternlive4(pos: &Pos, sd: Side) -> bool { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for pat in 0..4 { + let idx0 = sq; + let idx1 = sq + DIRECTION[pat][0]; + let idx2 = sq + DIRECTION[pat][1]; + let idx3 = sq + DIRECTION[pat][2]; + let idx4 = sq + DIRECTION[pat][3]; + let idx5 = sq + DIRECTION[pat][4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + } + + if n > 0 { + true + } else { + false + } +} + +/// Check OOOO-, OOO-O, OO-OO, O-OOO, -OOOO +fn check_patterndead4(pos: &Pos, sd: Side) -> i32 { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for dir in 0..4 { + let idx0 = sq; + let idx1 = sq + DIRECTION[dir][0]; + let idx2 = sq + DIRECTION[dir][1]; + let idx3 = sq + DIRECTION[dir][2]; + let idx4 = sq + DIRECTION[dir][3]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + } + + n +} + +/// Check -OOO-, -OO-O-, -O-OO-
+fn check_patternlive3(pos: &Pos, sd: Side) -> i32 { + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + + for dir in 0..4 { + let idx0 = sq; + let idx1 = sq + DIRECTION[dir][0]; + let idx2 = sq + DIRECTION[dir][1]; + let idx3 = sq + DIRECTION[dir][2]; + let idx4 = sq + DIRECTION[dir][3]; + let idx5 = sq + DIRECTION[dir][4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + #[rustfmt::skip] + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + } + + n +} + +#[target_feature(enable = "avx512f,avx512bw")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn pos_is_winner_avx512(pos: &Pos) -> bool { + let current_side = side_opp(pos.p_turn); + let coloridx = current_side as usize; + + let board0org: [__m512i; 2] = [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ]; // load states from bitboard + + #[rustfmt::skip] + let answer = _mm512_set1_epi16((1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)); // an unbroken chain of five moves + + // use Mask to filter out which data is not processed. + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 + // 1 x x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0 + // 2 x _ _ _ _ o _ x o _ _ _ _ _ _ 0 x o _ _ _ _ _| x x o o o x x _ _ + // . ... + // . ... + // . ... + // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0 + // + // answer_mask[0]: 01_11..............: "0" is in row 16 and column 1-16. + // There is no data to match (x = black, o = white, _ = empty, 0 = no data). + // + // + // Then, shift one space left. + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 + // 1 x x x _ _ _ _ _ _ _ _ _ _ _ 0 x o x o x 0 0 0 0 0 0 0 0 0 0 0 0 + // . ... + // . ... + // . ... + // 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 x o x o o o o o o o 0 0 0 0 0 0 0 + // answer_mask[1]: ................_10: "0" is in row 1 and column 17-32; + // There is no enough data to match (o x o x but we want to match o o o o o). + // + // answer_mask[2]: mix 2 data together (column 17-23 and column 24-32). Using Mask to make it match correctly. + // For example, column 23,24,25,26,27 is not a pattern and 24,25,26,27,28 is a pattern. + // That is why some mask bits are set to 0 from answer_mask[2] to answer_mask[10]. + + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + let mut count_match: i32 = 0; + + for dir in 0..2 { + // direction 0 and 1 + let mut board0 = board0org[dir]; + let boardf = _mm512_and_si512(answer, board0); + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + // OOOOOOOOOOO----, the last 4 "-" cannot make an unbroken chain of five. + board0 = _mm512_slli_epi32(board0, 1); // shift one space left + let boardf = _mm512_and_si512(answer, board0); // focus on the pattern + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); // see if it matches the pattern + count_match += _popcnt32(temp_mask as i32); + } + } + + if count_match > 0 { + return true; + } else { + return false; + } +} + +#[target_feature(enable = "avx512f,avx512bw")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn check_patternlive4_avx512(pos: &Pos, sd: Side) -> bool { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let answer_color = _mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) ); + #[rustfmt::skip] + let answer_empty = _mm512_set1_epi16( (1<<15)| (1<<10) ); + #[rustfmt::skip] + let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) ); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10]; + let board0org: [__m512i; 2] = [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ]; + let board1org: [__m512i; 2] = [ + _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]), + ]; + + let mut count_match: i32 = 0; + + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color, board0); + let boardf2 = _mm512_and_si512(answer_empty, board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..10 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color, board0); + let boardf2 = _mm512_and_si512(answer_empty, board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + + if count_match > 0 { + return true; + } else { + return false; + } +} + +#[target_feature(enable = "avx512f,avx512bw")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn check_patterndead4_avx512(pos: &Pos, sd: Side) -> i32 { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let answer_color: [__m512i; 5] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)| (1<<13)|(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14) |(1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13) |(1<<11) ), + _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 5]= [_mm512_set1_epi16( 1<<15 ), + _mm512_set1_epi16( 1<<14 ), + _mm512_set1_epi16( 1<<13 ), + _mm512_set1_epi16( 1<<12 ), + _mm512_set1_epi16( 1<<11)]; + #[rustfmt::skip] + let answer = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + let board0org: [__m512i; 2] = [ + _mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0]), + ]; + let board1org: [__m512i; 2] = [ + _mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), + _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0]), + ]; + + let mut count_match: i32 = 0; + + for pattern in 0..5 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + count_match +} + +#[target_feature(enable = "avx512f,avx512bw")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn check_patternlive3_avx512(pos: &Pos, sd: Side) -> i32 { + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + #[rustfmt::skip] + let board0org: [__m512i; 2] = [_mm512_loadu_epi32(&pos.bitboard[coloridx][0][0]), _mm512_loadu_epi32(&pos.bitboard[coloridx][1][0])]; + #[rustfmt::skip] + let board1org: [__m512i; 2] = [_mm512_loadu_epi32(&pos.bitboard[emptyidx][0][0]), _mm512_loadu_epi32(&pos.bitboard[emptyidx][1][0])]; + + #[rustfmt::skip] + let answer_color: [__m512i; 1] = [_mm512_set1_epi16( (1<<14)|(1<<13)|(1<<12) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 1] = [_mm512_set1_epi16( (1<<15)| (1<<11) )]; + #[rustfmt::skip] + let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11) ); + + let mut count_match: i32 = 0; + + #[rustfmt::skip] + let answer_mask: [__mmask32; 11] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_11_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_11_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_11_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_11_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_11_11_11_11_11_10]; + for pattern in 0..1 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..11 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + #[rustfmt::skip] + let answer_color: [__m512i; 2] = [_mm512_set1_epi16( (1<<14)| (1<<12)|(1<<11) ), + _mm512_set1_epi16( (1<<14)|(1<<13) |(1<<11) )]; + #[rustfmt::skip] + let answer_empty: [__m512i; 2] = [_mm512_set1_epi16( (1<<15)| (1<<13)| (1<<10) ), + _mm512_set1_epi16( (1<<15)| (1<<12)| (1<<10) )]; + #[rustfmt::skip] + let answer: __m512i = _mm512_set1_epi16( (1<<15)|(1<<14)|(1<<13)|(1<<12)|(1<<11)|(1<<10) ); + + #[rustfmt::skip] + let answer_mask: [__mmask32; 10] = [0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_11_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_11_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_11_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_11_10_10_10_10, + 0b01_11_11_11_11_11_11_11_11_11_11_10_10_10_10_10, + 0b00_11_11_11_11_11_11_11_11_11_10_10_10_10_10_10, + 0b00_10_11_11_11_11_11_11_11_10_10_10_10_10_11_10, + 0b00_10_10_11_11_11_11_11_10_10_10_10_10_11_11_10, + 0b00_10_10_10_11_11_11_10_10_10_10_10_11_11_11_10, + 0b00_10_10_10_10_11_10_10_10_10_10_11_11_11_11_10]; + for pattern in 0..2 { + for dir in 0..2 { + let mut board0 = board0org[dir]; + let mut board1 = board1org[dir]; + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[0], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + + for i in 1..10 { + board0 = _mm512_slli_epi32(board0, 1); + board1 = _mm512_slli_epi32(board1, 1); + + let boardf1 = _mm512_and_si512(answer_color[pattern], board0); + let boardf2 = _mm512_and_si512(answer_empty[pattern], board1); + let boardf = _mm512_or_si512(boardf1, boardf2); + + let temp_mask = _mm512_mask_cmpeq_epi16_mask(answer_mask[i], answer, boardf); + count_match += _popcnt32(temp_mask as i32); + } + } + } + + count_match +} + +fn main() { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512bw") { + println!("\n\nThe program is running with avx512f and avx512bw intrinsics\n\n"); + } else { + println!("\n\nThe program is running with NO intrinsics.\n\n"); + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + { + println!("\n\nThe program is running with NO intrinsics.\n\n"); + } + + loop { + let start = Instant::now(); + + println!("Hello, this is Connect5 (Outer-Open Gomoku)!"); + println!("Self-playing with search depth = 4"); + + let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; + let test_bitboard: [[[i32; 16]; 2]; 3] = [[[0; 16]; 2]; 3]; + + let mut test1 = Pos { + state: test_state, + p_turn: Color::Black, + bitboard: test_bitboard, + }; + + test1.init(); + + let mut count: i32 = 0; + + for i in 0..(FILE_SIZE * RANK_SIZE) { + let mut next_move: Move = square_make(1, 7); // set the first move is (1,7) + + if i > 0 { + next_move = search(&test1, -EVAL_INF, EVAL_INF, 4, 0); + } // search depth = 4 + + test1.do_move(next_move); + pos_disp(&test1); + + if pos_is_end(&test1) { + println!("Game over!!!!!! at Move {}", i); + count = i + 1; + break; + } + } + + let duration = start.elapsed(); + + println!( + "Average time for each move is: {:?}", + duration / count as u32 + ); + } +} diff --git a/library/stdarch/examples/hex.rs b/library/stdarch/examples/hex.rs new file mode 100644 index 000000000..812836d66 --- /dev/null +++ b/library/stdarch/examples/hex.rs @@ -0,0 +1,401 @@ +//! An example showing runtime dispatch to an architecture-optimized +//! implementation. +//! +//! This program implements hex encoding a slice into a predetermined +//! destination using various different instruction sets. This selects at +//! runtime the most optimized implementation and uses that rather than being +//! required to be compiled differently. +//! +//! You can test out this program via: +//! +//! echo test | cargo +nightly run --release hex +//! +//! and you should see `746573740a` get printed out. + +#![feature(stdsimd, wasm_target_feature)] +#![cfg_attr(test, feature(test))] +#![allow( + clippy::unwrap_used, + clippy::print_stdout, + clippy::unwrap_used, + clippy::shadow_reuse, + clippy::cast_possible_wrap, + clippy::cast_ptr_alignment, + clippy::cast_sign_loss, + clippy::missing_docs_in_private_items +)] + +use std::{ + io::{self, Read}, + str, +}; + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; + +fn main() { + let mut input = Vec::new(); + io::stdin().read_to_end(&mut input).unwrap(); + let mut dst = vec![0; 2 * input.len()]; + let s = hex_encode(&input, &mut dst).unwrap(); + println!("{}", s); +} + +fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + let len = src.len().checked_mul(2).unwrap(); + if dst.len() < len { + return Err(len); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + return unsafe { hex_encode_avx2(src, dst) }; + } + if is_x86_feature_detected!("sse4.1") { + return unsafe { hex_encode_sse41(src, dst) }; + } + } + #[cfg(target_arch = "wasm32")] + { + if true { + return unsafe { hex_encode_simd128(src, dst) }; + } + } + + hex_encode_fallback(src, dst) +} + +#[target_feature(enable = "avx2")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + let ascii_zero = _mm256_set1_epi8(b'0' as i8); + let nines = _mm256_set1_epi8(9); + let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8); + let and4bits = _mm256_set1_epi8(0xf); + + let mut i = 0_isize; + while src.len() >= 32 { + let invec = _mm256_loadu_si256(src.as_ptr() as *const _); + + let masked1 = _mm256_and_si256(invec, and4bits); + let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines); + let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); + let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + + // interleave masked1 and masked2 bytes + let res1 = _mm256_unpacklo_epi8(masked2, masked1); + let res2 = _mm256_unpackhi_epi8(masked2, masked1); + + // Store everything into the right destination now + let base = dst.as_mut_ptr().offset(i * 2); + let base1 = base.offset(0) as *mut _; + let base2 = base.offset(16) as *mut _; + let base3 = base.offset(32) as *mut _; + let base4 = base.offset(48) as *mut _; + _mm256_storeu2_m128i(base3, base1, res1); + _mm256_storeu2_m128i(base4, base2, res2); + src = &src[32..]; + i += 32; + } + + let i = i as usize; + let _ = hex_encode_sse41(src, &mut dst[i * 2..]); + + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) +} + +// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp +#[target_feature(enable = "sse4.1")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + let ascii_zero = _mm_set1_epi8(b'0' as i8); + let nines = _mm_set1_epi8(9); + let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8); + let and4bits = _mm_set1_epi8(0xf); + + let mut i = 0_isize; + while src.len() >= 16 { + let invec = _mm_loadu_si128(src.as_ptr() as *const _); + + let masked1 = _mm_and_si128(invec, and4bits); + let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = _mm_cmpgt_epi8(masked1, nines); + let cmpmask2 = _mm_cmpgt_epi8(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); + let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + + // interleave masked1 and masked2 bytes + let res1 = _mm_unpacklo_epi8(masked2, masked1); + let res2 = _mm_unpackhi_epi8(masked2, masked1); + + _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); + _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2); + src = &src[16..]; + i += 16; + } + + let i = i as usize; + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) +} + +#[cfg(target_arch = "wasm32")] +#[target_feature(enable = "simd128")] +unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + use core_arch::arch::wasm32::*; + + let ascii_zero = u8x16_splat(b'0'); + let nines = u8x16_splat(9); + let ascii_a = u8x16_splat(b'a' - 9 - 1); + let and4bits = u8x16_splat(0xf); + + let mut i = 0_isize; + while src.len() >= 16 { + let invec = v128_load(src.as_ptr() as *const _); + + let masked1 = v128_and(invec, and4bits); + let masked2 = v128_and(u8x16_shr(invec, 4), and4bits); + + // return 0xff corresponding to the elements > 9, or 0x00 otherwise + let cmpmask1 = u8x16_gt(masked1, nines); + let cmpmask2 = u8x16_gt(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = u8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1)); + let masked2 = u8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2)); + + // Next we need to shuffle around masked{1,2} to get back to the + // original source text order. The first element (res1) we'll store uses + // all the low bytes from the 2 masks and the second element (res2) uses + // all the upper bytes. + let res1 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + masked2, masked1, + ); + let res2 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + masked2, masked1, + ); + + v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); + v128_store(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2); + src = &src[16..]; + i += 16; + } + + let i = i as usize; + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) +} + +fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + fn hex(byte: u8) -> u8 { + static TABLE: &[u8] = b"0123456789abcdef"; + TABLE[byte as usize] + } + + for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) { + slots[0] = hex((*byte >> 4) & 0xf); + slots[1] = hex(*byte & 0xf); + } + + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) } +} + +// Run these with `cargo +nightly test --example hex -p stdarch` +#[cfg(test)] +mod tests { + use std::iter; + + use super::*; + + fn test(input: &[u8], output: &str) { + let tmp = || vec![0; input.len() * 2]; + + assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output); + assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + if self::is_x86_feature_detected!("avx2") { + assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output); + } + if self::is_x86_feature_detected!("sse4.1") { + assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output); + } + } + } + + #[test] + fn empty() { + test(b"", ""); + } + + #[test] + fn big() { + test( + &[0; 1024], + &iter::repeat('0').take(2048).collect::(), + ); + } + + #[test] + fn odd() { + test( + &[0; 313], + &iter::repeat('0').take(313 * 2).collect::(), + ); + } + + #[test] + fn avx_works() { + let mut input = [0; 33]; + input[4] = 3; + input[16] = 3; + input[17] = 0x30; + input[21] = 1; + input[31] = 0x24; + test( + &input, + "\ + 0000000003000000\ + 0000000000000000\ + 0330000000010000\ + 0000000000000024\ + 00\ + ", + ); + } + + quickcheck::quickcheck! { + fn encode_equals_fallback(input: Vec) -> bool { + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = hex_encode(&input, &mut space1).unwrap(); + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn avx_equals_fallback(input: Vec) -> bool { + if !self::is_x86_feature_detected!("avx2") { + return true + } + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() }; + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn sse41_equals_fallback(input: Vec) -> bool { + if !self::is_x86_feature_detected!("avx2") { + return true + } + let mut space1 = vec![0; input.len() * 2]; + let mut space2 = vec![0; input.len() * 2]; + let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() }; + let b = hex_encode_fallback(&input, &mut space2).unwrap(); + a == b + } + } +} + +// Run these with `cargo +nightly bench --example hex -p stdarch` +#[cfg(test)] +mod benches { + extern crate rand; + extern crate test; + + use self::rand::Rng; + + use super::*; + + const SMALL_LEN: usize = 117; + const LARGE_LEN: usize = 1 * 1024 * 1024; + + fn doit( + b: &mut test::Bencher, + len: usize, + f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>, + ) { + let mut rng = rand::thread_rng(); + let input = std::iter::repeat(()) + .map(|()| rng.gen::()) + .take(len) + .collect::>(); + let mut dst = vec![0; input.len() * 2]; + b.bytes = len as u64; + b.iter(|| unsafe { + f(&input, &mut dst).unwrap(); + dst[0] + }); + } + + #[bench] + fn small_default(b: &mut test::Bencher) { + doit(b, SMALL_LEN, hex_encode); + } + + #[bench] + fn small_fallback(b: &mut test::Bencher) { + doit(b, SMALL_LEN, hex_encode_fallback); + } + + #[bench] + fn large_default(b: &mut test::Bencher) { + doit(b, LARGE_LEN, hex_encode); + } + + #[bench] + fn large_fallback(b: &mut test::Bencher) { + doit(b, LARGE_LEN, hex_encode_fallback); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + mod x86 { + use super::*; + + #[bench] + fn small_avx2(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("avx2") { + doit(b, SMALL_LEN, hex_encode_avx2); + } + } + + #[bench] + fn small_sse41(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("sse4.1") { + doit(b, SMALL_LEN, hex_encode_sse41); + } + } + + #[bench] + fn large_avx2(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("avx2") { + doit(b, LARGE_LEN, hex_encode_avx2); + } + } + + #[bench] + fn large_sse41(b: &mut test::Bencher) { + if self::is_x86_feature_detected!("sse4.1") { + doit(b, LARGE_LEN, hex_encode_sse41); + } + } + } +} diff --git a/library/stdarch/examples/wasm.rs b/library/stdarch/examples/wasm.rs new file mode 100644 index 000000000..6b92ae9b8 --- /dev/null +++ b/library/stdarch/examples/wasm.rs @@ -0,0 +1,45 @@ +//! A simple slab allocator for pages in wasm + +#![feature(stdsimd)] +#![cfg(target_arch = "wasm32")] + +use std::ptr; + +use core_arch::arch::wasm32::*; + +static mut HEAD: *mut *mut u8 = 0 as _; + +#[no_mangle] +pub unsafe extern "C" fn page_alloc() -> *mut u8 { + if !HEAD.is_null() { + let next = *HEAD; + let ret = HEAD; + HEAD = next as *mut _; + return ret as *mut u8; + } + + let ret = memory_grow(0, 1); + + // if we failed to allocate a page then return null + if ret == usize::MAX { + return ptr::null_mut(); + } + + ((ret as u32) * page_size()) as *mut u8 +} + +#[no_mangle] +pub unsafe extern "C" fn page_free(page: *mut u8) { + let page = page as *mut *mut u8; + *page = HEAD as *mut u8; + HEAD = page; +} + +#[no_mangle] +pub unsafe extern "C" fn memory_used() -> usize { + (page_size() * (memory_size(0) as u32)) as usize +} + +fn page_size() -> u32 { + 64 * 1024 +} -- cgit v1.2.3