diff options
Diffstat (limited to 'dev')
-rw-r--r-- | dev/flags/flags.c | 8 | ||||
-rwxr-xr-x | dev/h2/mkhdr.sh | 80 | ||||
-rw-r--r-- | dev/haring/haring.c | 174 | ||||
-rw-r--r-- | dev/patchbot/README | 395 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-airo14-pfx.txt | 70 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-alpaca-pfx.txt | 68 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-alpaca-sfx.txt | 28 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-chatml-pfx.txt | 67 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-chatml-sfx.txt | 28 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt14-2.9-mist7b-sfx.txt | 29 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt15-3.1-mist7bv2-pfx.txt | 70 | ||||
-rw-r--r-- | dev/patchbot/prompts/prompt15-3.1-mist7bv2-sfx.txt | 29 | ||||
-rwxr-xr-x | dev/patchbot/scripts/post-ai.sh | 372 | ||||
-rwxr-xr-x | dev/patchbot/scripts/process-patch-v15.sh | 63 | ||||
-rwxr-xr-x | dev/patchbot/scripts/submit-ai.sh | 79 | ||||
-rwxr-xr-x | dev/patchbot/scripts/update-3.0.sh | 66 | ||||
-rw-r--r-- | dev/phash/phash.c | 113 |
17 files changed, 1663 insertions, 76 deletions
diff --git a/dev/flags/flags.c b/dev/flags/flags.c index 65af237..8da485b 100644 --- a/dev/flags/flags.c +++ b/dev/flags/flags.c @@ -12,6 +12,7 @@ #include <haproxy/mux_fcgi-t.h> #include <haproxy/mux_h2-t.h> #include <haproxy/mux_h1-t.h> +#include <haproxy/peers-t.h> #include <haproxy/stconn-t.h> #include <haproxy/stream-t.h> #include <haproxy/task-t.h> @@ -36,10 +37,13 @@ #define SHOW_AS_H1S 0x00010000 #define SHOW_AS_FCONN 0x00020000 #define SHOW_AS_FSTRM 0x00040000 +#define SHOW_AS_PEERS 0x00080000 +#define SHOW_AS_PEER 0x00100000 // command line names, must be in exact same order as the SHOW_AS_* flags above // so that show_as_words[i] matches flag 1U<<i. -const char *show_as_words[] = { "ana", "chn", "conn", "sc", "stet", "strm", "task", "txn", "sd", "hsl", "htx", "hmsg", "fd", "h2c", "h2s", "h1c", "h1s", "fconn", "fstrm"}; +const char *show_as_words[] = { "ana", "chn", "conn", "sc", "stet", "strm", "task", "txn", "sd", "hsl", "htx", "hmsg", "fd", "h2c", "h2s", "h1c", "h1s", "fconn", "fstrm", + "peers", "peer"}; /* will be sufficient for even largest flag names */ static char buf[4096]; @@ -152,6 +156,8 @@ int main(int argc, char **argv) if (show_as & SHOW_AS_H1S) printf("h1s->flags = %s\n", (h1s_show_flags (buf, bsz, " | ", flags), buf)); if (show_as & SHOW_AS_FCONN) printf("fconn->flags = %s\n",(fconn_show_flags (buf, bsz, " | ", flags), buf)); if (show_as & SHOW_AS_FSTRM) printf("fstrm->flags = %s\n",(fstrm_show_flags (buf, bsz, " | ", flags), buf)); + if (show_as & SHOW_AS_PEERS) printf("peers->flags = %s\n",(peers_show_flags (buf, bsz, " | ", flags), buf)); + if (show_as & SHOW_AS_PEER) printf("peer->flags = %s\n", (peer_show_flags (buf, bsz, " | ", flags), buf)); } return 0; } diff --git a/dev/h2/mkhdr.sh b/dev/h2/mkhdr.sh index 4d129fa..4ed1a07 100755 --- a/dev/h2/mkhdr.sh +++ b/dev/h2/mkhdr.sh @@ -4,9 +4,13 @@ # All fields are optional. 0 assumed when absent. 
USAGE=\ -"Usage: %s [-l <len> ] [-t <type>] [-f <flags>] [-i <sid>] [ -d <data> ] > hdr.bin +"Usage: %s [-l <len> ] [-t <type>] [-f <flags>[,...]] [-i <sid>] [ -d <data> ] + [ -e <name> <value> ]* [ -r|-R raw ] [ -h | --help ] > hdr.bin Numbers are decimal or 0xhex. Not set=0. If <data> is passed, it points - to a file that is read and chunked into frames of <len> bytes. + to a file that is read and chunked into frames of <len> bytes. -e + encodes a headers frame (by default) with all headers at once encoded + in literal. Use type 'p' for the preface. Use -r to pass raw data or + -R to pass raw hex codes (hex digit pairs, blanks ignored). Supported symbolic types (case insensitive prefix match): DATA (0x00) PUSH_PROMISE (0x05) @@ -25,6 +29,8 @@ LEN= TYPE= FLAGS= ID= +RAW= +HDR=( ) die() { [ "$#" -eq 0 ] || echo "$*" >&2 @@ -48,7 +54,7 @@ mkframe() { local T="${2:-0}" local F="${3:-0}" local I="${4:-0}" - local t f + local t f f2 f3 # get the first match in this order for t in DATA:0x00 HEADERS:0x01 RST_STREAM:0x03 SETTINGS:0x04 PING:0x06 \ @@ -66,17 +72,37 @@ mkframe() { die fi - # get the first match in this order - for f in ES:0x01 EH:0x04 PAD:0x08 PRIO:0x20; do - if [ -z "${f##${F^^*}*}" ]; then - F="${f##*:}" + # get the first match in this order, for each entry delimited by ','. 
+ # E.g.: "-f ES,EH" + f2=${F^^*}; F=0 + + while [ -n "$f2" ]; do + f3="${f2%%,*}" + tmp="" + for f in ES:0x01 EH:0x04 PAD:0x08 PRIO:0x20; do + if [ -n "$f3" -a -z "${f##${f3}*}" ]; then + tmp="${f#*:}" + break + fi + done + + if [ -n "$tmp" ]; then + F=$(( F | tmp )) + f2="${f2#$f3}" + f2="${f2#,}" + elif [ -z "${f3##[X0-9A-F]*}" ]; then + F=$(( F | f3 )) + f2="${f2#$f3}" + f2="${f2#,}" + else + echo "Unknown flag(s) '$f3'" >&2 + usage "${0##*}" + die fi done - if [ -n "${F##[0-9]*}" ]; then - echo "Unknown type '$T'" >&2 - usage "${0##*}" - die + if [ -n "$f2" ]; then + F="${f2} | ${F}" fi L=$(( L )); T=$(( T )); F=$(( F )); I=$(( I )) @@ -110,6 +136,9 @@ while [ -n "$1" -a -z "${1##-*}" ]; do -f) FLAGS="$2" ; shift 2 ;; -i) ID="$2" ; shift 2 ;; -d) DATA="$2" ; shift 2 ;; + -r) RAW="$2" ; shift 2 ;; + -R) RAW="$(printf $(echo -n "${2// /}" | sed -e 's/\([^ ][^ ]\)/\\\\x\1/g'))" ; shift 2 ;; + -e) TYPE=1; HDR[${#HDR[@]}]="$2=$3"; shift 3 ;; -h|--help) usage "${0##*}"; quit;; *) usage "${0##*}"; die ;; esac @@ -135,8 +164,35 @@ if [ -n "${ID##[0-9]*}" ]; then die fi -if [ -z "$DATA" ]; then +if [ "$TYPE" = "p" ]; then + printf "PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n" +elif [ -z "$DATA" ]; then + # If we're trying to emit literal headers, let's pre-build the raw data + # and measure their total length. + if [ ${#HDR[@]} -gt 0 ]; then + # limited to 127 bytes for name and value + for h in "${HDR[@]}"; do + n=${h%%=*} + v=${h#*=} + nl=${#n} + vl=${#v} + nl7=$(printf "%02x" $((nl & 127))) + vl7=$(printf "%02x" $((vl & 127))) + RAW="${RAW}\x40\x${nl7}${n}\x${vl7}${v}" + done + fi + + # compute length if RAW set + if [ -n "$RAW" ]; then + LEN=$(printf "${RAW}" | wc -c) + fi + mkframe "$LEN" "$TYPE" "$FLAGS" "$ID" + + # now emit the literal data of advertised length + if [ -n "$RAW" ]; then + printf "${RAW}" + fi else # read file $DATA in <LEN> chunks and send it in multiple frames # advertising their respective lengths. 
diff --git a/dev/haring/haring.c b/dev/haring/haring.c index ee7e1aa..4dfdafa 100644 --- a/dev/haring/haring.c +++ b/dev/haring/haring.c @@ -35,12 +35,34 @@ #include <haproxy/api.h> #include <haproxy/buf.h> -#include <haproxy/ring.h> +#include <haproxy/ring-t.h> +#include <haproxy/thread.h> int force = 0; // force access to a different layout int lfremap = 0; // remap LF in traces int repair = 0; // repair file +struct ring_v1 { + struct buffer buf; // storage area +}; + +// ring v2 format (not aligned) +struct ring_v2 { + size_t size; // storage size + size_t rsvd; // header length (used for file-backed maps) + size_t tail; // storage tail + size_t head; // storage head + char area[0]; // storage area begins immediately here +}; + +// ring v2 format (thread aligned) +struct ring_v2a { + size_t size; // storage size + size_t rsvd; // header length (used for file-backed maps) + size_t tail __attribute__((aligned(64))); // storage tail + size_t head __attribute__((aligned(64))); // storage head + char area[0] __attribute__((aligned(64))); // storage area begins immediately here +}; /* display the message and exit with the code */ __attribute__((noreturn)) void die(int code, const char *format, ...) @@ -69,75 +91,21 @@ __attribute__((noreturn)) void usage(int code, const char *arg0) "", arg0); } -/* This function dumps all events from the ring whose pointer is in <p0> into - * the appctx's output buffer, and takes from <o0> the seek offset into the - * buffer's history (0 for oldest known event). It looks at <i0> for boolean - * options: bit0 means it must wait for new data or any key to be pressed. Bit1 - * means it must seek directly to the end to wait for new contents. It returns - * 0 if the output buffer or events are missing is full and it needs to be - * called again, otherwise non-zero. It is meant to be used with - * cli_release_show_ring() to clean up. 
+/* dump a ring represented in a pre-initialized buffer, starting from offset + * <ofs> and with flags <flags> */ -int dump_ring(struct ring *ring, size_t ofs, int flags) +int dump_ring_as_buf(struct buffer buf, size_t ofs, int flags) { - struct buffer buf; uint64_t msg_len = 0; size_t len, cnt; const char *blk1 = NULL, *blk2 = NULL, *p; size_t len1 = 0, len2 = 0, bl; - /* Explanation: the storage area in the writing process starts after - * the end of the structure. Since the whole area is mmapped(), we know - * it starts at 0 mod 4096, hence the buf->area pointer's 12 LSB point - * to the relative offset of the storage area. As there will always be - * users using the wrong version of the tool with a dump, we need to - * run a few checks first. After that we'll create our own buffer - * descriptor matching that area. - */ - if ((((long)ring->buf.area) & 4095) != sizeof(*ring)) { - if (!force) { - fprintf(stderr, "FATAL: header in file is %ld bytes long vs %ld expected!\n", - (((long)ring->buf.area) & 4095), - (long)sizeof(*ring)); - exit(1); - } - else { - fprintf(stderr, "WARNING: header in file is %ld bytes long vs %ld expected!\n", - (((long)ring->buf.area) & 4095), - (long)sizeof(*ring)); - } - /* maybe we could emit a warning at least ? */ - } - - /* Now make our own buffer pointing to that area */ - buf = b_make(((void *)ring + (((long)ring->buf.area) & 4095)), - ring->buf.size, ring->buf.head, ring->buf.data); - - /* explanation for the initialization below: it would be better to do - * this in the parsing function but this would occasionally result in - * dropped events because we'd take a reference on the oldest message - * and keep it while being scheduled. Thus instead let's take it the - * first time we enter here so that we have a chance to pass many - * existing messages before grabbing a reference to a location. This - * value cannot be produced after initialization. 
- */ - if (unlikely(ofs == ~0)) { - ofs = 0; - - /* going to the end means looking at tail-1 */ - ofs = (flags & RING_WF_SEEK_NEW) ? buf.data - 1 : 0; - - //HA_ATOMIC_INC(b_peek(&buf, ofs)); - } - while (1) { - //HA_RWLOCK_RDLOCK(RING_LOCK, &ring->lock); - if (ofs >= buf.size) { fprintf(stderr, "FATAL error at %d\n", __LINE__); return 1; } - //HA_ATOMIC_DEC(b_peek(&buf, ofs)); /* in this loop, ofs always points to the counter byte that precedes * the message so that we can take our reference there if we have to @@ -198,9 +166,6 @@ int dump_ring(struct ring *ring, size_t ofs, int flags) ofs += cnt + msg_len; } - //HA_ATOMIC_INC(b_peek(&buf, ofs)); - //HA_RWLOCK_RDUNLOCK(RING_LOCK, &ring->lock); - if (!(flags & RING_WF_WAIT_MODE)) break; @@ -210,9 +175,84 @@ int dump_ring(struct ring *ring, size_t ofs, int flags) return 0; } +/* This function dumps all events from the ring <ring> from offset <ofs> and + * with flags <flags>. + */ +int dump_ring_v1(struct ring_v1 *ring, size_t ofs, int flags) +{ + struct buffer buf; + + /* Explanation: the storage area in the writing process starts after + * the end of the structure. Since the whole area is mmapped(), we know + * it starts at 0 mod 4096, hence the buf->area pointer's 12 LSB point + * to the relative offset of the storage area. As there will always be + * users using the wrong version of the tool with a dump, we need to + * run a few checks first. After that we'll create our own buffer + * descriptor matching that area. + */ + + /* Now make our own buffer pointing to that area */ + buf = b_make(((void *)ring + (((long)ring->buf.area) & 4095)), + ring->buf.size, ring->buf.head, ring->buf.data); + + return dump_ring_as_buf(buf, ofs, flags); +} + +/* This function dumps all events from the ring <ring> from offset <ofs> and + * with flags <flags>. 
+ */ +int dump_ring_v2(struct ring_v2 *ring, size_t ofs, int flags) +{ + size_t size, head, tail, data; + struct buffer buf; + + /* In ring v2 format, we have in this order: + * - size + * - hdr len (reserved bytes) + * - tail + * - head + * We can rebuild an equivalent buffer from these info for the function + * to dump. + */ + + /* Now make our own buffer pointing to that area */ + size = ring->size; + head = ring->head; + tail = ring->tail & ~RING_TAIL_LOCK; + data = (head <= tail ? 0 : size) + tail - head; + buf = b_make((void *)ring + ring->rsvd, size, head, data); + return dump_ring_as_buf(buf, ofs, flags); +} + +/* This function dumps all events from the ring <ring> from offset <ofs> and + * with flags <flags>. + */ +int dump_ring_v2a(struct ring_v2a *ring, size_t ofs, int flags) +{ + size_t size, head, tail, data; + struct buffer buf; + + /* In ring v2 format, we have in this order: + * - size + * - hdr len (reserved bytes) + * - tail + * - head + * We can rebuild an equivalent buffer from these info for the function + * to dump. + */ + + /* Now make our own buffer pointing to that area */ + size = ring->size; + head = ring->head; + tail = ring->tail & ~RING_TAIL_LOCK; + data = (head <= tail ? 
0 : size) + tail - head; + buf = b_make((void *)ring + ring->rsvd, size, head, data); + return dump_ring_as_buf(buf, ofs, flags); +} + int main(int argc, char **argv) { - struct ring *ring; + void *ring; struct stat statbuf; const char *arg0; int fd; @@ -254,7 +294,15 @@ int main(int argc, char **argv) return 1; } - return dump_ring(ring, ~0, 0); + if (((struct ring_v2 *)ring)->rsvd < 4096 && // not a pointer (v1), must be ringv2's rsvd + ((struct ring_v2 *)ring)->rsvd + ((struct ring_v2 *)ring)->size == statbuf.st_size) { + if (((struct ring_v2 *)ring)->rsvd < 192) + return dump_ring_v2(ring, 0, 0); + else + return dump_ring_v2a(ring, 0, 0); // thread-aligned version + } + else + return dump_ring_v1(ring, 0, 0); } diff --git a/dev/patchbot/README b/dev/patchbot/README new file mode 100644 index 0000000..a645cc3 --- /dev/null +++ b/dev/patchbot/README @@ -0,0 +1,395 @@ +Patchbot: AI bot making use of Natural Language Processing to suggest backports +=============================================================== 2023-12-18 ==== + + +Background +---------- + +Selecting patches to backport from the development branch is a tedious task, in +part due to the abundance of patches and the fact that many bug fixes are for +that same version and not for backporting. The more it gets delayed, the harder +it becomes, and the harder it is to start, the less likely it gets started. The +urban legend along which one "just" has to do that periodically doesn't work +because certain patches need to be left hanging for a while under observation, +others need to be merged urgently, and for some, the person in charge of the +backport might simply need an opinion from the patch's author or the affected +subsystem maintainer, and this cannot make the whole backport process stall. 
+ +The information needed to figure if a patch needs to be backported is present +in the commit message, with varying nuances such as "may", "may not", "should", +"probably", "shouldn't unless", "keep under observation" etc. One particularity +that is specific to backports is that the opinion on a patch may change over +time, either because it was later found to be wrong or insufficient, or because +the former analysis mistakenly suggested to backport or not to. + +This means that the person in charge of the backports has to read the whole +commit message for each patch, to figure the backporting instructions, and this +takes a while. + +Several attempts were made over the years to try to partially automate this +task, including the cherry-pick mode of the "git-show-backports" utility that +eases navigation back-and-forth between commits. + +Lately, a lot of progress was made in the domain of Natural Language +Understanding (NLU) and more generally Natural Language Processing (NLP). Between +the first attempts in early 2023 involving successive layers of the Roberta +model, called from totally unreliable Python code, and December 2023, the +situation evolved from promising but unusable to mostly autonomous. + +For those interested in history, the first attempts in early 2023 involved +successive layers of the Roberta model, but these were relying on totally +unreliable Python code that broke all the time and could barely be transferred +to another machine without upgrading or downgrading the installed modules, and +it used to use huge amounts of resources for a somewhat disappointing result: +the verdicts were correct roughly 60-70% of the time, it was not possible to +get hints such as "wait" nor even "uncertain". It could just be qualified as +promising. Another big limitation was the limit to 256 tokens, forcing the +script to select only the last few lines of the commit message to take the +decision. 
Roughly at the same time, in March 2023 Meta issued their much larger +LLaMa model, and Georgi Gerganov released "llama.cpp", an open-source C++ +engine that loads and runs such large models without all the usual problems +inherent to the Python ecosystem. New attempts were made with LLaMa and it was +already much better than Roberta, but the output was difficult to parse, and it +required to be combined with the final decision layer of Roberta. Then new +variants of LLaMa appeared such as Alpaca, which follows instructions, but +tends to forget them if given before the patch, then Vicuna which was pretty +reliable but very slow at 33B size and difficult to tune, then Airoboros, +which was the first one to give very satisfying results in a reasonable time, +following instructions reasonably closely with a stable output, but with +sometimes surprising analysis and contradictions. It was already about 90% +reliable and considered as a time saver in 13B size. Other models were later +tried as they appeared such as OpenChat-3.5, Juna, OpenInstruct, Orca-2, +Mistral-0.1 and its variants Neural and OpenHermes-2.5. Mistral showed an +unrivaled understanding despite being smaller and much faster than other ones, +but was a bit freewheeling regarding instructions. Dolphin-2.1 rebased on top +of it gave extremely satisfying results, with fewer variations in the output +format, but still the script had difficulties trying to catch its conclusion +from time to time, though it was pretty much readable for the human in charge +of the task. And finally just before releasing, Mistral-0.2 was released and +addressed all issues, with a human-like understanding and perfectly obeying +instructions, providing an extremely stable output format that is easy to parse +from simple scripts. The decisions now match the human's ones in close to 100% +of the patches, unless the human is aware of extra context, of course. 
+ + +Architecture +------------ + +The current solution relies on the llama.cpp engine, which is a simple, fast, +reliable and portable engine to load models and run inference, and the +Mistral-0.2 LLM. + +A collection of patches is built from the development branch since the -dev0 +tag, and for each of them, the engine is called to evaluate the developer's +intent based on the commit message. A detailed context explaining the haproxy +maintenance model and what the user wants is passed, then the LLM is invited to +provide its opinion on the need for a backport and an explanation of the reason +for its choice. This often helps the user to find a quick summary about the +patch. All these outputs are then converted to a long HTML page with colors and +radio buttons, where patches are pre-selected based on this classification, +that the user can consult and adjust, read the commits if needed, and the +selected patches finally provide some copy-pastable commands in a text-area to +select commit IDs to work on, typically in a form that's suitable for a simple +"git cherry-pick -sx". + +The scripts are designed to be able to run on a headless machine, called from a +crontab and with the output served from a static HTTP server. + +The code is currently found from Georgi Gerganov's repository: + + https://github.com/ggerganov/llama.cpp + +Tag b1505 is known to work fine, and uses the GGUF file format. + +The model(s) can be found on Hugging Face user "TheBloke"'s collection of +models: + + https://huggingface.co/TheBloke + +Model Mistral-7B-Instruct-v0.2-GGUF quantized at Q5K_M is known to work well +with the llama.cpp version above. + + +Deployment +---------- + +Note: it is a good idea to start to download the model(s) in the background as + such files are typically 5 GB or more and can take some time to download + depending on the internet bandwidth. + +It seems reasonable to create a dedicated user to periodically run this task. +Let's call it "patchbot". 
Developers should be able to easily run a shell from +this user to perform some maintenance or testing (e.g. "sudo"). + +All paths are specified in the example "update-3.0.sh" script, and assume a +deployment in the user's home, so this is what is being described here. The +proposed deployment layout is the following: + + $HOME (e.g. /home/patchbot) + | + +- data + | | + | +-- models # GGUF files from TheBloke's collection + | | + | +-- prompts # prompt*-pfx*, prompt*-sfx*, cache + | | + | +-- in + | | | + | | +-- haproxy # haproxy Git repo + | | | + | | +-- patches-3.0 # patches from development branch 3.0 + | | + | +-- out # report directory (HTML) + | + +- prog + | | + | +-- bin # program(s) + | | + | +-- scripts # processing scripts + | | + | +-- llama.cpp # llama Git repository + + +- Let's first create the structure: + + mkdir -p ~/data/{in,models,prompts} ~/prog/{bin,scripts} + +- data/in/haproxy must contain a clone of the haproxy development tree that + will periodically be pulled from: + + cd ~/data/in + git clone https://github.com/haproxy/haproxy + cd ~ + +- The prompt files are a copy of haproxy's "dev/patchbot/prompts/" subdirectory. + The prompt files are per-version because they contain references to the + haproxy development version number. For each prompt, there is a prefix + ("-pfx"), that is loaded before the patch, and a suffix ("-sfx") that + specifies the user's expectations after reading the patch. For best efficiency + it's useful to place most of the explanation in the prefix and the least + possible in the suffix, because the prefix is cacheable. Different models + will use different instructions formats and different explanations, so it's + fine to keep a collection of prompts and use only one. Different instruction + formats are commonly used, "llama-2", "alpaca", "vicuna", "chatml" being + common. When experimenting with a new model, just copy-paste the closest one + and tune it for best results. 
Since we already cloned haproxy above, we'll + take the files from there: + + cp ~/data/in/haproxy/dev/patchbot/prompts/*txt ~/data/prompts/ + + Upon first run, a cache file will be produced in this directory by parsing + an empty file and saving the current model's context. The cache file will + automatically be deleted and rebuilt if it is absent or older than the prefix + or suffix file. The cache files are specific to a model so when experimenting + with other models, be sure not to reuse the same cache file, or in doubt, + just delete them. Rebuilding the cache file typically takes around 2 minutes + of processing on a 8-core machine. + +- The model(s) from TheBloke's Hugging Face account have to be downloaded in + GGUF file format, quantized at Q5K_M, and stored as-is into data/models/. + +- data/in/patches-3.0/ is where the "mk-patch-list.sh" script will emit the + patches corresponding to new commits in the development branch. Its suffix + must match the name of the current development branch for patches to be found + there. In addition, the classification of the patches will be emitted there + next to the input patches, with the same name as the original file with a + suffix indicating what model/prompt combination was used. + + mkdir -p ~/data/in/patches-3.0 + +- data/out is where the final report will be emitted. If running on a headless + machine, it is worth making sure that this directory is accessible from a + static web server. Thus either create a directory and place a symlink or + configuration somewhere in the web server's settings to reference this + location, or make it a symlink to another place already exported by the web + server and make sure the user has the permissions to write there. + + mkdir -p ~/data/out + + On Ubuntu-20.04 it was found that the package "micro-httpd" works out of the + box serving /var/www/html and follows symlinks. 
As such this is sufficient to + expose the reports: + + sudo ln -s ~patchbot/data/out /var/www/html/patchbot + +- prog/bin will contain the executable(s) needed to operate, namely "main" from + llama.cpp: + + mkdir -p ~/prog/bin + +- prog/llama.cpp is a clone of the "llama.cpp" GitHub repository. As of + December 2023, the project has improved its forward compatibility and it's + generally both safe and recommended to stay on the last version, hence to + just clone the master branch. In case of difficulties, tag b1505 was proven + to work well with the aforementioned model. Building is done by default for + the local platform, optimised for speed with native CPU. + + mkdir -p ~/prog + cd ~/prog + git clone https://github.com/ggerganov/llama.cpp + [ only in case of problems: cd llama.cpp && git checkout b1505 ] + + make -j$(nproc) main LLAMA_FAST=1 + cp main ~/prog/bin/ + cd ~ + +- prog/scripts needs the following scripts: + - mk-patch-list.sh from haproxy's scripts/ subdirectory + - submit-ai.sh, process-*.sh, post-ai.sh, update-*.sh + + cp ~/data/in/haproxy/scripts/mk-patch-list.sh ~/prog/scripts/ + cp ~/data/in/haproxy/dev/patchbot/scripts/*.sh ~/prog/scripts/ + + - verify that the various paths in update-3.0.sh match your choices, or + adjust them: + + vi ~/prog/scripts/update-3.0.sh + + - the tool is memory-bound, so a machine with more memory channels and/or + very fast memory will usually be faster than a higher CPU count with a + lower memory bandwidth. In addition, the performance is not linear with + the number of cores and experimentation shows that efficiency drops above + 8 threads. For this reason the script integrates a "PARALLEL_RUNS" variable + indicating how many instances to run in parallel, each on its own patch. + This allows to make better use of the CPUs and memory bandwidth. Setting + 2 instances for 8 cores / 16 threads gives optimal results on dual memory + channel systems. 
+ +From this point, executing this update script manually should work and produce +the result. Count around 0.5-2 mn per patch on a 8-core machine, so it can be +reasonably fast during the early development stages (before -dev1) but +unbearably long later, where it can make more sense to run it at night. It +should not report any error and should only report the total execution time. + +If interrupted (Ctrl-C, logout, out of memory etc), check for incomplete .txt +files in ~/data/in/patches*/ that can result from this interruption, and delete +them because they will not be reproduced: + + ls -lart ~/data/in/patches-3.0/*.txt + ls -lS ~/data/in/patches-3.0/*.txt + +Once the output is produced, visit ~/data/out/ using a web browser and check +that the table loads correctly. Note that after a new release or a series of +backports, the table may appear empty, it's just because all known patches are +already backported and collapsed by default. Clicking on "All" at the top left +will unhide them. + +Finally when satisfied, place it in a crontab, for example, run every hour: + + crontab -e + + # m h dom mon dow command + # run every hour at minute 02 + 2 * * * * /home/patchbot/prog/scripts/update-3.0.sh + + +Usage +----- + +Using the HTML output is a bit rustic but efficient. The interface is split in +5 columns from left to right: + + - first column: patch number from 1 to N, just to ease navigation. Below the + number appears a radio button which allows to mark this patch as the start + of the review. When clicked, all prior patches disappear and are not listed + anymore. This can be undone by clicking on the radio button under the "All" + word in this column's header. + + + - second column: commit ID (abbreviated "CID" in the header). It's a 8-digit + shortened representation of the commit ID. It's presented as a link, which, + if clicked, will directly show that commit from the haproxy public + repository. 
Below the commit ID is the patch's author date in condensed + format "DD-MmmYY", e.g. "18-Dec23" for "18th December 2023". It was found + that having a date indication sometimes helps differentiate certain related + patches. + + - third column: "Subject", this is the subject of the patch, prefixed with + the 4-digit number matching the file name in the directory (e.g. helps to + remove or reprocess one if needed). This is also a link to the same commit + in the haproxy's public repository. At the lower right under the subject + is the shortened e-mail address (only user@domain keeping only the first + part of the domain, e.g. "foo@haproxy"). Just like with the date, it helps + figuring what to expect after a recent discussion with a developer. + + - fourth column: "Verdict". This column contains 4 radio buttons prefiguring + the choice for this patch between "N" for "No", represented in gray (this + patch should not be backported, let's drop it), "U" for "Uncertain" in + green (still unsure about it, most likely the author should be contacted), + "W" for "Wait" in blue (this patch should be backported but not + immediately, only after it has spent some time in the development branch), + and "Y" for "Yes" in red (this patch must be backported, let's pick it). + The choice is preselected by the scripts above, and since these are radio + buttons, the user is free to change this selection. Reloading will lose the + user's choices. When changing a selection, the line's background changes to + match a similar color tone, allowing to visually spot preselected patches. + + - fifth column: reason for the choice. The scripts try to provide an + explanation for the choice of the preselection, and try to always end with + a conclusion among "yes", "no", "wait", "uncertain". The explanation + usually fits in 2-4 lines and is faster to read than a whole commit message + and very often pretty accurate. 
It's also been noticed that Mistral-v0.2 + shows far fewer hallucinations than others (it doesn't seem to invent + information that was not part of its input), so seeing certain topics being + discussed there generally indicates that they were in the original commit + message. The scripts try to emphasize the sensitive parts of the commit + message such as risks, dependencies, referenced issues, oldest version to + backport to, etc. Elements that look like issue numbers and commit IDs are + turned to links to ease navigation. + +In addition, in order to improve readability, the top of the table shows 4 +buttons allowing to show/hide each category. For example, when trying to focus +only on "uncertain" and "wait", it can make sense to hide "N" and "Y" and click +"Y" or "N" on the displayed ones until there is none anymore. + +In order to reduce the risk of missing a misqualified patch, those marked "BUG" +or "DOC" are displayed in bold even if tagged "No". It has been shown to be +sufficient to catch the eye when scrolling and encouraging to re-visit them. + +More importantly, the script will try to also check which patches were already +backported to the previous stable version. Those that were backported will have +the first two columns colored gray, and by default, the review will start from +the first patch after the last backported one. This explains why just after a +backport, the table may appear empty with only the footer "New" checked. + +Finally, at the bottom of the table is an editable, copy-pastable text area +that is redrawn at each click. It contains a series of 4 shell commands that +can be copy-pasted at once and assign commit IDs to 4 variables, one per +category. 
Most often only "y" will be of interest, so for example if the +review process ends with: + + cid_y=( 7dab3e82 456ba6e9 75f5977f 917f7c74 ) + +Then copy-pasting it in a terminal already in the haproxy-2.9 directory and +issuing: + + git cherry-pick -sx ${cid_y[@]} + +will result in all these patches being backported to that version. + + +Criticisms +---------- + +The interface is absolutely ugly but gets the job done. Proposals to revamp it +are welcome, provided that they do not alter usability and portability (e.g. +the ability to open the locally produced file without requiring access to an +external server). + + +Thanks +------ + +This utility is the proof that offloading boringly repetitive tasks from humans +can save their time to do more productive things. This work, which +started with extremely limited tools, was made possible thanks to Meta, for +opening their models after leaking them, Georgi Gerganov and the community that +developed around llama.cpp, for creating the first really open engine that +builds out of the box and just works, contrary to the previous crippled Python- +only ecosystem, Tom Jobbins (aka TheBloke) for making it so easy to discover +new models every day by simply quantizing all of them and making them available +from a single location, MistralAI for producing an exceptionally good model +that surpasses all others, is the first one to feel as smart and accurate as a +real human on such tasks, is fast, and totally free, and of course, HAProxy +Technologies for investing some time on this and for the available hardware +that permits a lot of experimentation. diff --git a/dev/patchbot/prompts/prompt14-2.9-airo14-pfx.txt b/dev/patchbot/prompts/prompt14-2.9-airo14-pfx.txt new file mode 100644 index 0000000..2f3fde2 --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-airo14-pfx.txt @@ -0,0 +1,70 @@ +BEGININPUT +BEGINCONTEXT + +HAProxy's development cycle consists in one development branch, and multiple +maintenance branches.
+ +All the development is made into the development branch exclusively. This +includes mostly new features, doc updates, cleanups and of course, fixes. + +The maintenance branches, also called stable branches, never see any +development, and only receive ultra-safe fixes for bugs that affect them, +that are picked from the development branch. + +Branches are numbered in 0.1 increments. Every 6 months, upon a new major +release, the development branch enters maintenance and a new development branch +is created with a new, higher version. The current development branch is +2.9-dev, and maintenance branches are 2.8 and below. + +Fixes created in the development branch for issues that were introduced in an +earlier branch are applied in descending order to each and every version till +that branch that introduced the issue: 2.8 first, then 2.7, then 2.6 and so +on. This operation is called "backporting". A fix for an issue is never +backported beyond the branch that introduced the issue. An important point is +that the project maintainers really aim at zero regression in maintenance +branches, so they're never willing to take any risk backporting patches that +are not deemed strictly necessary. + +Fixes consist of patches managed using the Git version control tool and are +identified by a Git commit ID and a commit message. For this reason we +indistinctly talk about backporting fixes, commits, or patches; all mean the +same thing. When mentioning commit IDs, developers always use a short form +made of the first 8 characters only, and expect the AI assistant to do the +same. + +It seldom happens that some fixes depend on changes that were brought by other +patches that were not in some branches and that will need to be backported as +well for the fix to work. In this case, such information is explicitly provided +in the commit message by the patch's author in natural language. + +Developers are serious and always indicate if a patch needs to be backported.
+Sometimes they omit the exact target branch, or they will say that the patch is +"needed" in some older branch, but it means the same. If a commit message +doesn't mention any backport instructions, it means that the commit does not +have to be backported. And patches that are not strictly bug fixes nor doc +improvements are normally not backported. For example, fixes for design +limitations, architectural improvements and performance optimizations are +considered too risky for a backport. Finally, all bug fixes are tagged as +"BUG" at the beginning of their subject line. Patches that are not tagged as +such are not bugs, and must never be backported unless their commit message +explicitly requests so. + +ENDCONTEXT + +A developer is reviewing the development branch, trying to spot which commits +need to be backported to maintenance branches. This person is already expert +on HAProxy and everything related to Git, patch management, and the risks +associated with backports, so he doesn't want to be told how to proceed nor to +review the contents of the patch. + +The goal for this developer is to get some help from the AI assistant to save +some precious time on this tedious review work. In order to do a better job, he +needs an accurate summary of the information and instructions found in each +commit message. Specifically he needs to figure if the patch fixes a problem +affecting an older branch or not, if it needs to be backported, if so to which +branches, and if other patches need to be backported along with it. + +The indented text block below after an "id" line and starting with a Subject line +is a commit message from the HAProxy development branch that describes a patch +applied to that branch, starting with its subject line, please read it carefully. 
+ diff --git a/dev/patchbot/prompts/prompt14-2.9-alpaca-pfx.txt b/dev/patchbot/prompts/prompt14-2.9-alpaca-pfx.txt new file mode 100644 index 0000000..cabe7f0 --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-alpaca-pfx.txt @@ -0,0 +1,68 @@ +### Instruction: + +HAProxy's development cycle consists in one development branch, and multiple +maintenance branches. + +All the development is made into the development branch exclusively. This +includes mostly new features, doc updates, cleanups and of course, fixes. + +The maintenance branches, also called stable branches, never see any +development, and only receive ultra-safe fixes for bugs that affect them, +that are picked from the development branch. + +Branches are numbered in 0.1 increments. Every 6 months, upon a new major +release, the development branch enters maintenance and a new development branch +is created with a new, higher version. The current development branch is +2.9-dev, and maintenance branches are 2.8 and below. + +Fixes created in the development branch for issues that were introduced in an +earlier branch are applied in descending order to each and every version till +that branch that introduced the issue: 2.8 first, then 2.7, then 2.6 and so +on. This operation is called "backporting". A fix for an issue is never +backported beyond the branch that introduced the issue. An important point is +that the project maintainers really aim at zero regression in maintenance +branches, so they're never willing to take any risk backporting patches that +are not deemed strictly necessary. + +Fixes consist of patches managed using the Git version control tool and are +identified by a Git commit ID and a commit message. For this reason we +indistinctly talk about backporting fixes, commits, or patches; all mean the +same thing. When mentioning commit IDs, developers always use a short form +made of the first 8 characters only, and expect the AI assistant to do the +same.
+ +It seldom happens that some fixes depend on changes that were brought by other +patches that were not in some branches and that will need to be backported as +well for the fix to work. In this case, such information is explicitly provided +in the commit message by the patch's author in natural language. + +Developers are serious and always indicate if a patch needs to be backported. +Sometimes they omit the exact target branch, or they will say that the patch is +"needed" in some older branch, but it means the same. If a commit message +doesn't mention any backport instructions, it means that the commit does not +have to be backported. And patches that are not strictly bug fixes nor doc +improvements are normally not backported. For example, fixes for design +limitations, architectural improvements and performance optimizations are +considered too risky for a backport. Finally, all bug fixes are tagged as +"BUG" at the beginning of their subject line. Patches that are not tagged as +such are not bugs, and must never be backported unless their commit message +explicitly requests so. + +A developer is reviewing the development branch, trying to spot which commits +need to be backported to maintenance branches. This person is already expert +on HAProxy and everything related to Git, patch management, and the risks +associated with backports, so he doesn't want to be told how to proceed nor to +review the contents of the patch. + +The goal for this developer is to get some help from the AI assistant to save +some precious time on this tedious review work. In order to do a better job, he +needs an accurate summary of the information and instructions found in each +commit message. Specifically he needs to figure if the patch fixes a problem +affecting an older branch or not, if it needs to be backported, if so to which +branches, and if other patches need to be backported along with it. 
+ +The indented text block below after an "id" line and starting with a Subject line +is a commit message from the HAProxy development branch that describes a patch +applied to that branch, starting with its subject line, please read it carefully. + +### Input: diff --git a/dev/patchbot/prompts/prompt14-2.9-alpaca-sfx.txt b/dev/patchbot/prompts/prompt14-2.9-alpaca-sfx.txt new file mode 100644 index 0000000..9906132 --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-alpaca-sfx.txt @@ -0,0 +1,28 @@ + +### Instruction: + +You are an AI assistant that follows instruction extremely well. Help as much +as you can, responding to a single question using a single response. + +The developer wants to know if he needs to backport the patch above to fix +maintenance branches, for which branches, and what possible dependencies might +be mentioned in the commit message. Carefully study the commit message and its +backporting instructions if any (otherwise it should probably not be backported), +then provide a very concise and short summary that will help the developer decide +to backport it, or simply to skip it. + +Start by explaining in one or two sentences what you recommend for this one and why. +Finally, based on your analysis, give your general conclusion as "Conclusion: X" +where X is a single word among: + - "yes", if you recommend to backport the patch right now either because + it explicitly states this or because it's a fix for a bug that affects + a maintenance branch (2.8 or lower); + - "wait", if this patch explicitly mentions that it must be backported, but + only after waiting some time. + - "no", if nothing clearly indicates a necessity to backport this patch (e.g. 
+ lack of explicit backport instructions, or it's just an improvement); + - "uncertain" otherwise for cases not covered above + +### Response: + +Explanation: diff --git a/dev/patchbot/prompts/prompt14-2.9-chatml-pfx.txt b/dev/patchbot/prompts/prompt14-2.9-chatml-pfx.txt new file mode 100644 index 0000000..c35138e --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-chatml-pfx.txt @@ -0,0 +1,67 @@ +<|im_start|>system +HAProxy's development cycle consists in one development branch, and multiple +maintenance branches. + +All the development is made into the development branch exclusively. This +includes mostly new features, doc updates, cleanups and of course, fixes. + +The maintenance branches, also called stable branches, never see any +development, and only receive ultra-safe fixes for bugs that affect them, +that are picked from the development branch. + +Branches are numbered in 0.1 increments. Every 6 months, upon a new major +release, the development branch enters maintenance and a new development branch +is created with a new, higher version. The current development branch is +2.9-dev, and maintenance branches are 2.8 and below. + +Fixes created in the development branch for issues that were introduced in an +earlier branch are applied in descending order to each and every version till +that branch that introduced the issue: 2.8 first, then 2.7, then 2.6 and so +on. This operation is called "backporting". A fix for an issue is never +backported beyond the branch that introduced the issue. An important point is +that the project maintainers really aim at zero regression in maintenance +branches, so they're never willing to take any risk backporting patches that +are not deemed strictly necessary. + +Fixes consist of patches managed using the Git version control tool and are +identified by a Git commit ID and a commit message. For this reason we +indistinctly talk about backporting fixes, commits, or patches; all mean the +same thing.
When mentioning commit IDs, developers always use a short form +made of the first 8 characters only, and expect the AI assistant to do the +same. + +It seldom happens that some fixes depend on changes that were brought by other +patches that were not in some branches and that will need to be backported as +well for the fix to work. In this case, such information is explicitly provided +in the commit message by the patch's author in natural language. + +Developers are serious and always indicate if a patch needs to be backported. +Sometimes they omit the exact target branch, or they will say that the patch is +"needed" in some older branch, but it means the same. If a commit message +doesn't mention any backport instructions, it means that the commit does not +have to be backported. And patches that are not strictly bug fixes nor doc +improvements are normally not backported. For example, fixes for design +limitations, architectural improvements and performance optimizations are +considered too risky for a backport. Finally, all bug fixes are tagged as +"BUG" at the beginning of their subject line. Patches that are not tagged as +such are not bugs, and must never be backported unless their commit message +explicitly requests so. + +A developer is reviewing the development branch, trying to spot which commits +need to be backported to maintenance branches. This person is already expert +on HAProxy and everything related to Git, patch management, and the risks +associated with backports, so he doesn't want to be told how to proceed nor to +review the contents of the patch. + +The goal for this developer is to get some help from the AI assistant to save +some precious time on this tedious review work. In order to do a better job, he +needs an accurate summary of the information and instructions found in each +commit message. 
Specifically he needs to figure if the patch fixes a problem +affecting an older branch or not, if it needs to be backported, if so to which +branches, and if other patches need to be backported along with it. + +The indented text block below after an "id" line and starting with a Subject line +is a commit message from the HAProxy development branch that describes a patch +applied to that branch, starting with its subject line, please read it carefully. +<|im_end|> +<|im_start|>user diff --git a/dev/patchbot/prompts/prompt14-2.9-chatml-sfx.txt b/dev/patchbot/prompts/prompt14-2.9-chatml-sfx.txt new file mode 100644 index 0000000..31e26d6 --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-chatml-sfx.txt @@ -0,0 +1,28 @@ +<|im_end|> +<|im_start|>system + +You are an AI assistant that follows instruction extremely well. Help as much +as you can, responding to a single question using a single response. + +The developer wants to know if he needs to backport the patch above to fix +maintenance branches, for which branches, and what possible dependencies might +be mentioned in the commit message. Carefully study the commit message and its +backporting instructions if any (otherwise it should probably not be backported), +then provide a very concise and short summary that will help the developer decide +to backport it, or simply to skip it. + +Start by explaining in one or two sentences what you recommend for this one and why. +Finally, based on your analysis, give your general conclusion as "Conclusion: X" +where X is a single word among: + - "yes", if you recommend to backport the patch right now either because + it explicitly states this or because it's a fix for a bug that affects + a maintenance branch (2.8 or lower); + - "wait", if this patch explicitly mentions that it must be backported, but + only after waiting some time. + - "no", if nothing clearly indicates a necessity to backport this patch (e.g. 
+ lack of explicit backport instructions, or it's just an improvement); + - "uncertain" otherwise for cases not covered above +<|im_end|> +<|im_start|>assistant + +Explanation: diff --git a/dev/patchbot/prompts/prompt14-2.9-mist7b-sfx.txt b/dev/patchbot/prompts/prompt14-2.9-mist7b-sfx.txt new file mode 100644 index 0000000..3d1b03b --- /dev/null +++ b/dev/patchbot/prompts/prompt14-2.9-mist7b-sfx.txt @@ -0,0 +1,29 @@ + +ENDINPUT +BEGININSTRUCTION + +You are an AI assistant that follows instruction extremely well. Help as much +as you can, responding to a single question using a single response. + +The developer wants to know if he needs to backport the patch above to fix +maintenance branches, for which branches, and what possible dependencies might +be mentioned in the commit message. Carefully study the commit message and its +backporting instructions if any (otherwise it should probably not be backported), +then provide a very concise and short summary that will help the developer decide +to backport it, or simply to skip it. + +Start by explaining in one or two sentences what you recommend for this one and why. +Finally, based on your analysis, give your general conclusion as "Conclusion: X" +where X is a single word among: + - "yes", if you recommend to backport the patch right now either because + it explicitly states this or because it's a fix for a bug that affects + a maintenance branch (2.8 or lower); + - "wait", if this patch explicitly mentions that it must be backported, but + only after waiting some time. + - "no", if nothing clearly indicates a necessity to backport this patch (e.g. 
+ lack of explicit backport instructions, or it's just an improvement); + - "uncertain" otherwise for cases not covered above + +ENDINSTRUCTION + +Explanation: diff --git a/dev/patchbot/prompts/prompt15-3.1-mist7bv2-pfx.txt b/dev/patchbot/prompts/prompt15-3.1-mist7bv2-pfx.txt new file mode 100644 index 0000000..3120167 --- /dev/null +++ b/dev/patchbot/prompts/prompt15-3.1-mist7bv2-pfx.txt @@ -0,0 +1,70 @@ +BEGININPUT +BEGINCONTEXT + +HAProxy's development cycle consists in one development branch, and multiple +maintenance branches. + +All the development is made into the development branch exclusively. This +includes mostly new features, doc updates, cleanups and of course, fixes. + +The maintenance branches, also called stable branches, never see any +development, and only receive ultra-safe fixes for bugs that affect them, +that are picked from the development branch. + +Branches are numbered in 0.1 increments. Every 6 months, upon a new major +release, the development branch enters maintenance and a new development branch +is created with a new, higher version. The current development branch is +3.1-dev, and maintenance branches are 3.0 and below. + +Fixes created in the development branch for issues that were introduced in an +earlier branch are applied in descending order to each and every version till +that branch that introduced the issue: 3.0 first, then 2.9, then 2.8 and so +on. This operation is called "backporting". A fix for an issue is never +backported beyond the branch that introduced the issue. An important point is +that the project maintainers really aim at zero regression in maintenance +branches, so they're never willing to take any risk backporting patches that +are not deemed strictly necessary. + +Fixes consist of patches managed using the Git version control tool and are +identified by a Git commit ID and a commit message. For this reason we +indistinctly talk about backporting fixes, commits, or patches; all mean the +same thing.
When mentioning commit IDs, developers always use a short form +made of the first 8 characters only, and expect the AI assistant to do the +same. + +It seldom happens that some fixes depend on changes that were brought by other +patches that were not in some branches and that will need to be backported as +well for the fix to work. In this case, such information is explicitly provided +in the commit message by the patch's author in natural language. + +Developers are serious and always indicate if a patch needs to be backported. +Sometimes they omit the exact target branch, or they will say that the patch is +"needed" in some older branch, but it means the same. If a commit message +doesn't mention any backport instructions, it means that the commit does not +have to be backported. And patches that are not strictly bug fixes nor doc +improvements are normally not backported. For example, fixes for design +limitations, architectural improvements and performance optimizations are +considered too risky for a backport. Finally, all bug fixes are tagged as +"BUG" at the beginning of their subject line. Patches that are not tagged as +such are not bugs, and must never be backported unless their commit message +explicitly requests so. + +ENDCONTEXT + +A developer is reviewing the development branch, trying to spot which commits +need to be backported to maintenance branches. This person is already expert +on HAProxy and everything related to Git, patch management, and the risks +associated with backports, so he doesn't want to be told how to proceed nor to +review the contents of the patch. + +The goal for this developer is to get some help from the AI assistant to save +some precious time on this tedious review work. In order to do a better job, he +needs an accurate summary of the information and instructions found in each +commit message. 
Specifically he needs to figure if the patch fixes a problem +affecting an older branch or not, if it needs to be backported, if so to which +branches, and if other patches need to be backported along with it. + +The indented text block below after an "id" line and starting with a Subject line +is a commit message from the HAProxy development branch that describes a patch +applied to that branch, starting with its subject line, please read it carefully. + diff --git a/dev/patchbot/prompts/prompt15-3.1-mist7bv2-sfx.txt b/dev/patchbot/prompts/prompt15-3.1-mist7bv2-sfx.txt new file mode 100644 index 0000000..dd4280b --- /dev/null +++ b/dev/patchbot/prompts/prompt15-3.1-mist7bv2-sfx.txt @@ -0,0 +1,29 @@ + +ENDINPUT +BEGININSTRUCTION + +You are an AI assistant that follows instruction extremely well. Help as much +as you can, responding to a single question using a single response. + +The developer wants to know if he needs to backport the patch above to fix +maintenance branches, for which branches, and what possible dependencies might +be mentioned in the commit message. Carefully study the commit message and its +backporting instructions if any (otherwise it should probably not be backported), +then provide a very concise and short summary that will help the developer decide +to backport it, or simply to skip it. + +Start by explaining in one or two sentences what you recommend for this one and why. +Finally, based on your analysis, give your general conclusion as "Conclusion: X" +where X is a single word among: + - "yes", if you recommend to backport the patch right now either because + it explicitly states this or because it's a fix for a bug that affects + a maintenance branch (3.0 or lower); + - "wait", if this patch explicitly mentions that it must be backported, but + only after waiting some time. + - "no", if nothing clearly indicates a necessity to backport this patch (e.g. 
+ lack of explicit backport instructions, or it's just an improvement); + - "uncertain" otherwise for cases not covered above + +ENDINSTRUCTION + +Explanation: diff --git a/dev/patchbot/scripts/post-ai.sh b/dev/patchbot/scripts/post-ai.sh new file mode 100755 index 0000000..7dba63a --- /dev/null +++ b/dev/patchbot/scripts/post-ai.sh @@ -0,0 +1,372 @@ +#!/bin/bash + +#### +#### Todo: +#### - change line color based on the selected radio button +#### - support collapsing lines per color/category (show/hide for each) +#### - add category "next" and see if the prompt can handle that (eg: d3e379b3) +#### - produce multiple lists on output (per category) allowing to save batches +#### + +die() { + [ "$#" -eq 0 ] || echo "$*" >&2 + exit 1 +} + +err() { + echo "$*" >&2 +} + +quit() { + [ "$#" -eq 0 ] || echo "$*" + exit 0 +} + +#### Main + +USAGE="Usage: ${0##*/} [ -h ] [ -b 'bkp_list' ] patch..." +MYSELF="$0" +GITURL="http://git.haproxy.org/?p=haproxy.git;a=commitdiff;h=" +ISSUES="https://github.com/haproxy/haproxy/issues/" +BKP="" + +while [ -n "$1" -a -z "${1##-*}" ]; do + case "$1" in + -h|--help) quit "$USAGE" ;; + -b) BKP="$2"; shift 2 ;; + *) die "$USAGE" ;; + esac +done + +PATCHES=( "$@" ) + +if [ ${#PATCHES[@]} = 0 ]; then + die "$USAGE" +fi + +# BKP is a space-delimited list of 8-char commit IDs, we'll +# assign them to the local bkp[] associative array. 
+ +declare -A bkp + +for cid in $BKP; do + bkp[$cid]=1 +done + +# some colors +BG_B="#e0e0e0" +BT_N="gray"; BG_N="white" +BT_U="#00e000"; BG_U="#e0ffe0" +BT_W="#0060ff"; BG_W="#e0e0ff" +BT_Y="red"; BG_Y="#ffe0e0" + +echo "<HTML>" + +cat <<- EOF +<HEAD><style> +input.n[type="radio"] { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 3px solid $BT_N; + background-color: transparent; +} +input.n[type="radio"]:checked { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 2px solid black; + background-color: $BT_N; +} + +input.u[type="radio"] { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 3px solid $BT_U; + background-color: transparent; +} +input.u[type="radio"]:checked { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 2px solid black; + background-color: $BT_U; +} + +input.w[type="radio"] { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 3px solid $BT_W; + background-color: transparent; +} +input.w[type="radio"]:checked { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 2px solid black; + background-color: $BT_W; +} + +input.y[type="radio"] { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 3px solid $BT_Y; + background-color: transparent; +} +input.y[type="radio"]:checked { + appearance: none; + width: 1.25em; + height: 1.25em; + border-radius: 50%; + border: 2px solid black; + background-color: $BT_Y; +} +</style> + +<script type="text/javascript"><!-- + +var nb_patches = 0; +var cid = []; +var bkp = []; + +// first line to review +var review = 0; + +// show/hide table lines and update their color +function updt_table(line) { + var b = document.getElementById("sh_b").checked; + var n = document.getElementById("sh_n").checked; + var u = document.getElementById("sh_u").checked; + var w = 
document.getElementById("sh_w").checked; + var y = document.getElementById("sh_y").checked; + var tn = 0, tu = 0, tw = 0, ty = 0; + var i, el; + + for (i = 1; i < nb_patches; i++) { + if (document.getElementById("bt_" + i + "_n").checked) { + tn++; + if (line && i != line) + continue; + el = document.getElementById("tr_" + i); + el.style.backgroundColor = "$BG_N"; + el.style.display = n && (b || !bkp[i]) && i >= review ? "" : "none"; + } + else if (document.getElementById("bt_" + i + "_u").checked) { + tu++; + if (line && i != line) + continue; + el = document.getElementById("tr_" + i); + el.style.backgroundColor = "$BG_U"; + el.style.display = u && (b || !bkp[i]) && i >= review ? "" : "none"; + } + else if (document.getElementById("bt_" + i + "_w").checked) { + tw++; + if (line && i != line) + continue; + el = document.getElementById("tr_" + i); + el.style.backgroundColor = "$BG_W"; + el.style.display = w && (b || !bkp[i]) && i >= review ? "" : "none"; + } + else if (document.getElementById("bt_" + i + "_y").checked) { + ty++; + if (line && i != line) + continue; + el = document.getElementById("tr_" + i); + el.style.backgroundColor = "$BG_Y"; + el.style.display = y && (b || !bkp[i]) && i >= review ? 
"" : "none"; + } + else { + // bug + if (line && i != line) + continue; + el = document.getElementById("tr_" + i); + el.style.backgroundColor = "red"; + el.style.display = ""; + } + } + document.getElementById("cnt_n").innerText = tn; + document.getElementById("cnt_u").innerText = tu; + document.getElementById("cnt_w").innerText = tw; + document.getElementById("cnt_y").innerText = ty; +} + +function updt_output() { + var b = document.getElementById("sh_b").checked; + var i, y = "", w = "", u = "", n = ""; + + for (i = 1; i < nb_patches; i++) { + if (i < review) + continue; + if (bkp[i]) + continue; + if (document.getElementById("bt_" + i + "_y").checked) + y = y + " " + cid[i]; + else if (document.getElementById("bt_" + i + "_w").checked) + w = w + " " + cid[i]; + else if (document.getElementById("bt_" + i + "_u").checked) + u = u + " " + cid[i]; + else if (document.getElementById("bt_" + i + "_n").checked) + n = n + " " + cid[i]; + } + + // update the textarea + document.getElementById("output").value = + "cid_y=(" + y + " )\n" + + "cid_w=(" + w + " )\n" + + "cid_u=(" + u + " )\n" + + "cid_n=(" + n + " )\n"; +} + +function updt(line,value) { + if (value == "r") { + review = line; + line = 0; // redraw everything + } + updt_table(line); + updt_output(); +} + +// --> +</script> +</HEAD> +EOF + +echo "<BODY>" +echo -n "<big><big>Show:" +echo -n " <span style='background-color:$BG_B'><input type='checkbox' onclick='updt_table(0);' id='sh_b' checked />B (${#bkp[*]})</span> " +echo -n " <span style='background-color:$BG_N'><input type='checkbox' onclick='updt_table(0);' id='sh_n' checked />N (<span id='cnt_n'>0</span>)</span> " +echo -n " <span style='background-color:$BG_U'><input type='checkbox' onclick='updt_table(0);' id='sh_u' checked />U (<span id='cnt_u'>0</span>)</span> " +echo -n " <span style='background-color:$BG_W'><input type='checkbox' onclick='updt_table(0);' id='sh_w' checked />W (<span id='cnt_w'>0</span>)</span> " +echo -n " <span 
style='background-color:$BG_Y'><input type='checkbox' onclick='updt_table(0);' id='sh_y' checked />Y (<span id='cnt_y'>0</span>)</span> " +echo -n "</big></big> (B=show backported, N=no/drop, U=uncertain, W=wait/next, Y=yes/pick" +echo ")<P/>" + +echo "<TABLE COLS=5 BORDER=1 CELLSPACING=0 CELLPADDING=3>" +echo "<TR><TH>All<br/><input type='radio' name='review' onclick='updt(0,\"r\");' checked title='Start review here'/></TH><TH>CID</TH><TH>Subject</TH><TH>Verdict<BR>N U W Y</BR></TH><TH>Reason</TH></TR>" +seq_num=1; do_check=1; review=0; +for patch in "${PATCHES[@]}"; do + # try to retrieve the patch's numbering (0001-9999) + pnum="${patch##*/}" + pnum="${pnum%%[^0-9]*}" + + id=$(sed -ne 's/^#id: \(.*\)/\1/p' "$patch") + resp=$(grep -v ^llama "$patch" | sed -ne '/^Explanation:/,$p' | sed -z 's/\n[\n]*/\n/g' | sed -z 's/\([^. ]\)\n\([A-Z]\)/\1.\n\2/' | tr '\012' ' ') + resp="${resp#Explanation:}"; + while [ -n "$resp" -a -z "${resp##[ .]*}" ]; do + resp="${resp#[ .]}" + done + + respl=$(printf '%s\n' "$resp" | tr 'A-Z' 'a-z') + + if [[ "${respl}" =~ (conclusion|verdict)[:\ ][^.]*yes ]]; then + verdict=yes + elif [[ "${respl}" =~ (conclusion|verdict)[:\ ][^.]*wait ]]; then + verdict=wait + elif [[ "${respl}" =~ (conclusion|verdict)[:\ ][^.]*no ]]; then + verdict=no + elif [[ "${respl}" =~ (conclusion|verdict)[:\ ][^.]*uncertain ]]; then + verdict=uncertain + elif [[ "${respl}" =~ (\"wait\"|\"yes\"|\"no\"|\"uncertain\")[^\"]*$ ]]; then + # last word under quotes in the response, sometimes happens as + # in 'thus I would conclude "no"'. + verdict=${BASH_REMATCH[1]} + else + verdict=uncertain + fi + + verdict="${verdict//[\"\',;:. ]}" + verdict=$(echo -n "$verdict" | tr '[A-Z]' '[a-z]') + + # There are two formats for the ID line: + # - old: #id: cid subject + # - new: #id: cid author date subject + # We can detect the 2nd one as the date starts with a series of digits + # followed by "-" then an upper case letter (eg: "18-Dec23").
+ set -- $id + cid="$1" + author="" + date="" + if [ -n "$3" ] && [ -z "${3##[1-9]-[A-Z]*}" -o -z "${3##[0-3][0-9]-[A-Z]*}" ]; then + author="$2" + date="$3" + subj="${id#$cid $author $date }" + else + subj="${id#$cid }" + fi + + if [ -z "$cid" ]; then + echo "ERROR: commit ID not found in patch $pnum: $patch" >&2 + continue + fi + + echo "<script type='text/javascript'>cid[$seq_num]='$cid'; bkp[$seq_num]=${bkp[$cid]:+1}+0;</script>" + + echo -n "<TR id='tr_$seq_num' name='$cid'" + + # highlight unqualified docs and bugs + if [ "$verdict" != "no" ]; then + : # no special treatment for accepted/uncertain elements + elif [ -z "${subj##BUG*}" ] && ! [[ "${respl}" =~ (explicitly|specifically|clearly|also|commit\ message|does)[\ ]*(state|mention|say|request) ]]; then + # bold for BUG marked "no" with no "explicitly states that ..." + echo -n " style='font-weight:bold'" + elif [ -z "${subj##DOC*}" ]; then # && ! [[ "${respl}" =~ (explicitly|specifically|clearly|also|commit\ message|does)[\ ]*(state|mention|say|request) ]]; then + # gray for DOC marked "no" + echo -n " style='font-weight:bold'" + #echo -n " bgcolor=#E0E0E0" #"$BG_U" + fi + + echo -n ">" + + # HTMLify subject and summary + subj="${subj//&/&}"; subj="${subj//</<}"; subj="${subj//>/>}"; + resp="${resp//&/&}"; resp="${resp//</<}"; resp="${resp//>/>}"; + + # turn "#XXXX" to a link to an issue + resp=$(echo "$resp" | sed -e "s|#\([0-9]\{1,5\}\)|<a href='${ISSUES}\1'>#\1</a>|g") + + # put links to commit IDs + resp=$(echo "$resp" | sed -e "s|\([0-9a-f]\{8,40\}\)|<a href='${GITURL}\1'>\1</a>|g") + + echo -n "<TD nowrap align=center ${bkp[$cid]:+style='background-color:${BG_B}'}>$seq_num<BR/>" + echo -n "<input type='radio' name='review' onclick='updt($seq_num,\"r\");' ${do_check:+checked} title='Start review here'/></TD>" + echo -n "<TD nowrap ${bkp[$cid]:+style='background-color:${BG_B}'}><tt><a href='${GITURL}${cid}'>$cid</a></tt>${date:+<br/><small style='font-weight:normal'>$date</small>}</TD>" + echo -n "<TD 
nowrap><a href='${GITURL}${cid}'>${pnum:+$pnum }$subj</a>${author:+<br/><div align=right><small style='font-weight:normal'>$author</small></div>}</TD>" + echo -n "<TD nowrap align=center>" + echo -n "<input type='radio' onclick='updt($seq_num,\"n\");' id='bt_${seq_num}_n' class='n' name='$cid' value='n' title='Drop' $( [ "$verdict" != no ] || echo -n checked) />" + echo -n "<input type='radio' onclick='updt($seq_num,\"u\");' id='bt_${seq_num}_u' class='u' name='$cid' value='u' title='Uncertain' $( [ "$verdict" != uncertain ] || echo -n checked) />" + echo -n "<input type='radio' onclick='updt($seq_num,\"w\");' id='bt_${seq_num}_w' class='w' name='$cid' value='w' title='wait in -next' $([ "$verdict" != wait ] || echo -n checked) />" + echo -n "<input type='radio' onclick='updt($seq_num,\"y\");' id='bt_${seq_num}_y' class='y' name='$cid' value='y' title='Pick' $( [ "$verdict" != yes ] || echo -n checked) />" + echo -n "</TD>" + echo -n "<TD>$resp</TD>" + echo "</TR>" + echo + ((seq_num++)) + + # if this patch was already backported, make the review start on the next + if [ -n "${bkp[$cid]}" ]; then + review=$seq_num + do_check=1 + else + do_check= + fi +done + +echo "<TR><TH>New<br/><input type='radio' name='review' onclick='updt($seq_num,\"r\");' ${do_check:+checked} title='Nothing to backport'/></TH><TH>CID</TH><TH>Subject</TH><TH>Verdict<BR>N U W Y</BR></TH><TH>Reason</TH></TR>" + +echo "</TABLE>" +echo "<P/>" +echo "<H3>Output:</H3>" +echo "<textarea cols=120 rows=10 id='output'></textarea>" +echo "<P/>" +echo "<script type='text/javascript'>nb_patches=$seq_num; review=$review; updt_table(0); updt_output();</script>" +echo "</BODY></HTML>" diff --git a/dev/patchbot/scripts/process-patch-v15.sh b/dev/patchbot/scripts/process-patch-v15.sh new file mode 100755 index 0000000..e9f718a --- /dev/null +++ b/dev/patchbot/scripts/process-patch-v15.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# the patch itself +F="$1" +shift + +# if non-empty, force to redo the patch 
+FORCE="${FORCE:-}" + +CPU="${CPU:-$(nproc)}" +MODEL="${MODEL:-../models/airoboros-l2-13b-gpt4-1.4.1.Q5_K_M.gguf}" +PROMPT_PFX="${PROMPT_PFX:-prompt14-airo14-pfx.txt}" +PROMPT_SFX="${PROMPT_SFX:-prompt14-airo14-sfx.txt}" +CACHE="${CACHE:-prompt-airo14.cache}" +CACHE_RO="${CACHE_RO- --prompt-cache-ro}" +EXT="${EXT:-airo14.txt}" +OUTPUT="${OUTPUT:-$(set -- "$F"."$EXT"; echo $1)}" +MAINPROG="${MAINPROG:-./main}" + +# switch to interactive mode with this reverse-prompt at the end if set. +# Typically: INTERACTIVE="Developer". +INTERACTIVE=${INTERACTIVE:-""} + +# Compute the full prompt +# +# Input format for "$F": git-format-patch with lines in this order: +# 1: From cid ... +# 2: From: author user@... +# 3: Date: +# 4: Subject: +# ... +# n: ^---$ +# It will emit a preliminary line with the commit ID, the author, the date, +# the subject, then the whole commit message indented. The output can be +# searched using grep '^\(Bot:\|#id:\)' + +PROMPT="$(cat "$PROMPT_PFX"; cat "$F" | sed -e '/^---/,$d' -e '/^Signed-off-by:/d' -e '/^Cc:/d' -e '/^Reported-by:/d' -e '/^Acked-by:/d' -e '1s/From \([0-9a-f]\{8\}\)\([0-9a-f]\{32\}\).*/\1/' -e '2s/^From: .*<\([^<@>]*\)@\([^<.>]*\).*/\1@\2/' -e '3s/^Date:[^,]*, \([^ ]*\) \([^ ]*\) 20\([^ ]*\).*/\1-\2\3/' | sed -ne '1h;1d;2x;2G;2h;2d;3x;3G;3h;3d;4x;4G;4s/^\([^\n]*\)\n\([^\n]*\)\n\([^\n]*\)\nSubject: \(.*\)/#id: \1 \2 \3 \4\n\nSubject: \4/;p' | sed -e '3,$s/^/ \0/'; echo; cat "$PROMPT_SFX")" + +# already done: don't do it again. Note that /dev/null is OK +if [ -z "$FORCE" -a -s "$OUTPUT" ]; then + exit 0 +fi + +# In order to rebuild the prompt cache: +# OUTPUT=blah CACHE_RO= ./$0 /dev/null +# +# Note: airoboros is able to carefully isolate an entire context, tests show +# that it's possible to ask it to repeat the entire commit message and it does +# so correctly. 
However its logic is sometimes bizarre + + +if [ -z "$INTERACTIVE" ]; then + LANG=C "$MAINPROG" --log-disable --model "$MODEL" --threads "$CPU" --ctx_size 4096 --temp 0.36 --top_k 12 --top_p 1 --repeat_last_n 256 --batch_size 16384 --repeat_penalty 1.1 --n_predict 200 --multiline-input --prompt "$PROMPT" --prompt-cache "$CACHE" $CACHE_RO "$@" 2>&1 | grep -v ^llama_model_loader | grep -v ^llm_load_ > "${OUTPUT}" + if [ "$?" != 0 ]; then + # failed: this is likely because the text is too long + (echo "$PROMPT"; echo + echo "Explanation: the commit message was way too long, couldn't analyse it." + echo "Conclusion: uncertain" + echo) > "${OUTPUT}" + fi +else + LANG=C "$MAINPROG" --log-disable --model "$MODEL" --threads "$CPU" --ctx_size 4096 --temp 0.36 --repeat_penalty 1.1 --n_predict 200 --multiline-input --prompt "$PROMPT" --prompt-cache "$CACHE" $CACHE_RO -n -1 -i --color --in-prefix ' ' --reverse-prompt "$INTERACTIVE:" "$@" +fi diff --git a/dev/patchbot/scripts/submit-ai.sh b/dev/patchbot/scripts/submit-ai.sh new file mode 100755 index 0000000..d6c6710 --- /dev/null +++ b/dev/patchbot/scripts/submit-ai.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# note: the program may re-execute itself: when it has more than one patch to +# process, it will call itself with one patch only in argument. When called +# with a single patch in argument, it will always start the analysis directly. 
# Environment variables understood by this program:
# - EXT        file name extension for the response
# - MODEL      path to the model file (GGUF format)
# - FORCE      force to re-process existing patches
# - PROGRAM    path to the script to be called
# - CACHE      path to the prompt cache (optional)
# - CACHE_RO   force cache to remain read-only
# - PROMPT_PFX path to the prompt prefix (before the patch)
# - PROMPT_SFX path to the prompt suffix (after the patch)
# - TOT_CPUS   total number of usable CPUs (def: nproc or 1)
# - SLOT_CPUS  if defined, it's an array of CPU sets for each running slot
# - CPU_SLOT   passed by the first level to the second one to allow binding
#              to a specific CPU set based on the slot number from 0 to N-1.

# Print the optional message on stderr, then terminate with status 1.
die() {
	if [ "$#" -ne 0 ]; then
		echo "$*" >&2
	fi
	exit 1
}

# Print the message on stderr without terminating.
err() {
	>&2 echo "$*"
}

# Print the optional message on stdout, then terminate with status 0.
quit() {
	if [ "$#" -ne 0 ]; then
		echo "$*"
	fi
	exit 0
}

#### Main

# detect if running under -x, pass it down to sub-processes
#opt=; set -o | grep xtrace | grep -q on && opt=-x

# Help text, shown for -h/--help and on invalid usage.
USAGE="Usage: ${0##*/} [ -s slots ] patch..."
+MYSELF="$0" +TOT_CPUS=${TOT_CPUS:-$(nproc)} +TOT_CPUS=${TOT_CPUS:-1} +SLOTS=1 + + +while [ -n "$1" -a -z "${1##-*}" ]; do + case "$1" in + -s) SLOTS="$2" ; shift 2 ;; + -h|--help) quit "$USAGE" ;; + *) die "$USAGE" ;; + esac +done + +[ -n "$EXT" ] || die "Missing extension name (EXT)" +[ -n "$MODEL" ] || die "Missing model name (MODEL)" +[ -n "$PROGRAM" ] || die "Missing program name (PROGRAM)" +[ -n "$PROMPT_PFX" ] || die "Missing prompt prefix (PROMPT_PFX)" +[ -n "$PROMPT_SFX" ] || die "Missing prompt suffix (PROMPT_SFX)" + +PATCHES=( "$@" ) + +if [ ${#PATCHES[@]} = 0 ]; then + die "$USAGE" +elif [ ${#PATCHES[@]} = 1 ]; then + # really execute + taskset_cmd="" + if [ -n "$CPU_SLOT" ] && [ -n "${SLOT_CPUS[$CPU_SLOT]}" ]; then + taskset_cmd="taskset -c ${SLOT_CPUS[$CPU_SLOT]}" + fi + export CPU=$TOT_CPUS + ${taskset_cmd} ${PROGRAM} "${PATCHES[0]}" +else + # divide CPUs by number of slots + export TOT_CPUS=$(( (TOT_CPUS + SLOTS - 1) / SLOTS )) + # reexecute ourselves in parallel with a single patch each + xargs -n 1 -P "${SLOTS}" --process-slot-var=CPU_SLOT "${MYSELF}" -s 1 <<< "${PATCHES[@]}" +fi + diff --git a/dev/patchbot/scripts/update-3.0.sh b/dev/patchbot/scripts/update-3.0.sh new file mode 100755 index 0000000..5f8ac87 --- /dev/null +++ b/dev/patchbot/scripts/update-3.0.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +SCRIPTS_DIR="$HOME/prog/scripts" +HAPROXY_DIR="$HOME/data/in/haproxy" +PATCHES_PFX="$HOME/data/in/patches" +VERDICT_DIR="$HOME/data/out" +PROMPTS_DIR="$HOME/data/prompts" +MODELS_DIR="$HOME/data/models" +MAINPROG="$HOME/prog/bin/main" + +PARALLEL_RUNS=2 + +BRANCH=$(cd "$HAPROXY_DIR" && git describe --tags HEAD|cut -f1 -d-|cut -f2- -dv) +if [ -z "$BRANCH" ]; then + echo "Couldn't guess current branch, aborting." 
+ exit 1 +fi + +# eg: for v3.0-dev0^ we should get v2.9.0 hence "2.9" +STABLE=$(cd "$HAPROXY_DIR" && git describe --tags "v${BRANCH}-dev0^" |cut -f1,2 -d.|cut -f2- -dv) + +PATCHES_DIR="$PATCHES_PFX"-"$BRANCH" + +(cd "$HAPROXY_DIR" + git pull + last_file=$(ls -1 "$PATCHES_DIR"/*.patch 2>/dev/null | tail -n1) + if [ -n "$last_file" ]; then + restart=$(head -n1 "$last_file" | cut -f2 -d' ') + else + restart="v${BRANCH}-dev0" + fi + "$SCRIPTS_DIR"/mk-patch-list.sh -o "$PATCHES_DIR" -b v${BRANCH}-dev0 $(git log $restart.. --oneline | cut -f1 -d' ') +) + +# List backported fixes (possibly none) +BKP=( + $( + cd "$HAPROXY_DIR" + if ! git remote update "$STABLE"; then + git remote add "$STABLE" "http://git.haproxy.org/git/haproxy-${STABLE}.git/" + git remote update "$STABLE" + fi >&2 + + git log --no-decorate --reverse "v${STABLE}.0..${STABLE}/master" | + sed -ne 's,(cherry picked from commit \(.\{8\}\).*,\1,p' + ) +) + +# by far the best model for now with little uncertain and few wait +echo "${BRANCH}: mistral-7b-v0.2" + +if [ ! -e "${PROMPTS_DIR}/prompt-${BRANCH}-m7bv02.cache" -o "${PROMPTS_DIR}/prompt15-${BRANCH}-mist7bv2-pfx.txt" -nt "${PROMPTS_DIR}/prompt-${BRANCH}-m7bv02.cache" ]; then + echo "Regenerating the prompt cache, may take 1-2 min" + rm -f "${PROMPTS_DIR}/prompt-${BRANCH}-m7bv02.cache" + rm -f empty + touch empty + time EXT=m7bv02.txt MODEL=${MODELS_DIR}/mistral-7b-instruct-v0.2.Q5_K_M.gguf CACHE=${PROMPTS_DIR}/prompt-${BRANCH}-m7bv02.cache CACHE_RO= PROMPT_PFX=${PROMPTS_DIR}/prompt15-${BRANCH}-mist7bv2-pfx.txt PROMPT_SFX=${PROMPTS_DIR}/prompt15-${BRANCH}-mist7bv2-sfx.txt MAINPROG=$MAINPROG PROGRAM="$SCRIPTS_DIR"/process-patch-v15.sh "$SCRIPTS_DIR"/submit-ai.sh empty + rm -f empty empty.m7bv02.txt + echo "Done!" 
+fi + +# Now process the patches, may take 1-2 hours +time EXT=m7bv02.txt MODEL=${MODELS_DIR}/mistral-7b-instruct-v0.2.Q5_K_M.gguf CACHE=${PROMPTS_DIR}/prompt-${BRANCH}-m7bv02.cache PROMPT_PFX=${PROMPTS_DIR}/prompt15-${BRANCH}-mist7bv2-pfx.txt PROMPT_SFX=${PROMPTS_DIR}/prompt15-${BRANCH}-mist7bv2-sfx.txt MAINPROG=$MAINPROG PROGRAM="$SCRIPTS_DIR"/process-patch-v15.sh "$SCRIPTS_DIR"/submit-ai.sh -s ${PARALLEL_RUNS} ${PATCHES_DIR}/*.patch + +# generate the output, takes 3-5 seconds +"$SCRIPTS_DIR"/post-ai.sh -b "${BKP[*]}" ${PATCHES_DIR}/*.m7bv02.txt > ${VERDICT_DIR}/verdict-${BRANCH}-m7bv02.html diff --git a/dev/phash/phash.c b/dev/phash/phash.c new file mode 100644 index 0000000..8a27405 --- /dev/null +++ b/dev/phash/phash.c @@ -0,0 +1,113 @@ +/* Brute-force based perfect hash generator for small sets of integers. Just + * fill the table below with the integer values, try to pad a little bit to + * avoid too complicated divides, experiment with a few operations in the + * hash function and reuse the output as-is to make your table. You may also + * want to experiment with the random generator to use either one or two + * distinct values for mul and key. + */ + +#include <stdio.h> +#include <stdlib.h> + +/* warning no more than 32 distinct values! */ + +//#define CODES 21 +//#define CODES 20 +//#define CODES 19 +//const int codes[CODES] = { 200,400,401,403,404,405,407,408,410,413,421,422,425,429,500,501,502,503,504}; + +#define CODES 32 +const int codes[CODES] = { 200,400,401,403,404,405,407,408,410,413,421,422,425,429,500,501,502,503,504, + /* padding entries below, which will fall back to the default code */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + +unsigned mul, xor; +unsigned bmul = 0, bxor = 0; + +static unsigned rnd32seed = 0x11111111U; +static unsigned rnd32() +{ + rnd32seed ^= rnd32seed << 13; + rnd32seed ^= rnd32seed >> 17; + rnd32seed ^= rnd32seed << 5; + return rnd32seed; +} + +/* the hash function to use in the target code. 
Try various combinations of + * multiplies and xor, always folded with a modulo, and try to spot the + * simplest operations if possible. Sometimes it may be worth adding a few + * dummy codes to get a better modulo code. In this case, just add dummy + * values at the end, but always distinct from the original ones. If the + * number of codes is even, it might be needed to rotate left the result + * before the modulo to compensate for lost LSBs. + */ +unsigned hash(unsigned i) +{ + //return ((i * mul) - (i ^ xor)) % CODES; // more solutions + //return ((i * mul) + (i ^ xor)) % CODES; // alternate + //return ((i ^ xor) * mul) % CODES; // less solutions but still OK for sequences up to 19 long + //return ((i * mul) ^ xor) % CODES; // less solutions but still OK for sequences up to 19 long + + i = i * mul; + i >>= 5; + //i = i ^ xor; + //i = (i << 30) | (i >> 2); // rotate 2 right + //i = (i << 2) | (i >> 30); // rotate 2 left + //i |= i >> 20; + //i += i >> 30; + //i |= i >> 16; + return i % CODES; + //return ((i * mul) ^ xor) % CODES; // less solutions but still OK for sequences up to 19 long +} + +int main(int argc, char **argv) +{ + unsigned h, i, flag, best, tests; + + if (argc > 2) { + mul = atol(argv[1]); + xor = atol(argv[2]); + for (i = 0; i < CODES && codes[i] >= 0; i++) + printf("hash(%4u) = %4u // [%4u] = %4u\n", codes[i], hash(codes[i]), hash(codes[i]), codes[i]); + return 0; + } + + tests = 0; + best = 0; + while (/*best < CODES &&*/ ++tests) { + mul = rnd32(); + xor = mul; // works for some sequences up to 21 long + //xor = rnd32(); // more solutions + + flag = 0; + for (i = 0; i < CODES && codes[i] >= 0; i++) { + h = hash(codes[i]); + if (flag & (1 << h)) + break; + flag |= 1 << h; + } + + if (i > best || + (i == best && mul <= bmul && xor <= bxor)) { + /* find the best code and try to find the smallest + * parameters among the best ones (need to disable + * best<CODES in the loop for this). 
Small values are + * interesting for some multipliers, and for some RISC + * architectures where literals can be loaded in less + * instructions. + */ + best = i; + bmul = mul; + bxor = xor; + printf("%u: mul=%u xor=%u\n", best, bmul, bxor); + } + + if ((tests & 0x7ffff) == 0) + printf("%u tests...\r", tests); + } + printf("%u tests, %u vals with mul=%u xor=%u:\n", tests, best, bmul, bxor); + + mul = bmul; xor = bxor; + for (i = 0; i < CODES && codes[i] >= 0; i++) + printf("hash(%4u) = %2u // [%2u] = %4u\n", codes[i], hash(codes[i]), hash(codes[i]), codes[i]); +} |