aboutsummaryrefslogblamecommitdiff
path: root/scripts/run.sh
blob: aa8fca121af2d8af90fad8133e55e79b1b5ca5d2 (plain) (tree)
1
2
3
4
5
6
7





                                                       
                                               
















                                                                                                                                                      
             




























































































































































































                                                                                                  
                                     
                                                     





                                     



















                                                                                   





                                            

        



                                     









                                                                  





                                            








                                            


                                   















                                                                                                         




                                     





                                                                                                           





                                            



























                                                                                                         




                                     






                                                                                                         





                                            











































































































































                                                                                                                                         
#!/bin/bash

# ---------------------------------------------------------------------------
# Default settings. Most of them can be overridden by the long options parsed
# near the end of this script.
# ---------------------------------------------------------------------------

# Name of the server executable; also the process pattern used by
# reset_remote_pat below.
proj_server_bin="hotstuff-app"
# Path of the server binary that run_remote_pat launches.
proj_server_path="/home/ted/hot-stuff/$proj_server_bin"
# Filename given to the shared configuration inside the template directory.
proj_conf_name="hotstuff.conf"

peer_list="./nodes.txt"     # the list of nodes
# Source file of the shared configuration (copied to <template>/$proj_conf_name).
conf_src="./hotstuff.gen.conf"
server_map="./server_map.txt"         # optional mapping from node ip to server ip
template_dir="template"     # the dir that keeps the content shared among all nodes
remote_base="/home/ted/testbed"  # remote dir used to keep files for the experiment
#remote_base="/tmp/"  # remote dir used to keep files for the experiment
remote_log="log"   # log filename
# Login name substituted for <remote_user> in the patterns below.
remote_user="ted"
# Command patterns. The <...> placeholders are textually replaced before the
# resulting string is passed to eval (see copy_file, execute_remote_cmd_* and
# _remote_start).
copy_to_remote_pat="rsync -avz <local_path> <remote_user>@<remote_ip>:<remote_path>"
copy_from_remote_pat="rsync -avz <remote_user>@<remote_ip>:<remote_path> <local_path>"
exe_remote_pat="ssh <remote_user>@<remote_ip> bash"
# NOTE(review): $proj_server_path and $proj_server_bin are expanded right here,
# at assignment time, so later --bin/--path overrides do not reach these two
# patterns unless --run-remote-pat/--reset-remote-pat are given as well.
run_remote_pat="cd \"<rworkdir>\"; gdb -ex r -ex bt -ex generate-core-file -ex q --args '$proj_server_path' --conf \"hotstuff.gen-sec<node_id>.conf\""
reset_remote_pat="pgrep -f '$proj_server_bin' | xargs kill -9"

fin_keyword="error:"  # the keyword indicating completion of execution
# Seconds wait_all sleeps between two polling rounds.
fin_chk_period=1
# Node ids matching this regex are skipped by finished_all and wait_all.
fin_chk_skip_pat='^([A-O][0-9]*)|(_ctl)$'
# When 1, get_node_info reads $peer_list instead of the file it is given.
force_peer_list=0
# Number of background jobs launched before the batch loops call `wait`.
async_num=128

# join SEP ARG...
# Print all ARGs as one line, glued together with the first character of SEP.
function join {
    local sep="$1"
    shift
    # "$*" joins the positional parameters with the first character of IFS.
    local IFS="$sep"
    echo "$*"
}
# split SEP STRING
# Break STRING at the characters in SEP and print the pieces separated by
# single spaces. The expansion is deliberately left unquoted: word splitting
# with the local IFS is what performs the split.
function split {
    local IFS="$1"
    local -a pieces
    pieces=($2)
    echo "${pieces[@]}"
}

# die MESSAGE
# Print MESSAGE and terminate the script with exit status 1.
function die {
    local message="$1"
    echo "$message"
    exit 1
}

# Global node tables filled by get_node_info:
#   nodes[id]      -> "ip:peer_port;client_port"
#   node_confs[id] -> space-separated list of extra per-node config files
#   nodes_cnt      -> number of peer-list entries processed so far
declare -A nodes
declare -A node_confs
nodes_cnt=0

# get_node_info FILE
# Parse the peer list FILE (or $peer_list when force_peer_list is 1) and fill
# the tables above. Each non-empty line has the form
#   id:ip:peer_port;client_port [extra_conf ...]
# with whitespace between the address and the optional extra config files.
# Globals written: nodes, node_confs, nodes_cnt, node_list (the raw lines;
# start_all prints it).
# Output: one "id => addr & confs" line per parsed node.
function get_node_info {
    local pl="$1"
    if [[ "$force_peer_list" == 1 ]]; then
        pl="$peer_list"
    fi
    local line id
    local -a tup0 tup

    # Read the file line by line rather than through an unquoted $(cat ...):
    # this leaves the global IFS alone and does not glob-expand the entries.
    node_list=()
    while IFS= read -r line || [[ -n "$line" ]]; do
        [[ -n "$line" ]] && node_list+=("$line")
    done < "$pl"

    for line in "${node_list[@]}"; do
        # tup0: address followed by the extra config files
        IFS=$' \t' read -r -a tup0 <<< "$line"
        # tup: id, ip, "peer_port;client_port"
        IFS=: read -r -a tup <<< "${tup0[0]}"
        id="${tup[0]}"
        # A line without an id cannot be used as an array key; skip it.
        [[ -n "$id" ]] || continue
        nodes[$id]="${tup[1]}:${tup[2]}"
        node_confs[$id]="${tup0[@]:1}"
        echo "$id => ${nodes[$id]} & ${node_confs[$id]}"
        nodes_cnt=$((nodes_cnt + 1))
    done
}

# server_map[node_ip] -> ip of the server that actually hosts the node.
declare -A server_map

# get_server_map FILE
# Load the optional node-ip -> server-ip mapping from FILE. Each non-empty
# line holds two whitespace-separated fields; anything after the second field
# is ignored. A missing or unreadable FILE leaves the map untouched.
# Globals written: server_map.
# Output: one "mapping a => b" line per entry.
function get_server_map {
    local map_file="$1"
    local key value rest

    if [[ ! -r "$map_file" ]]; then
        echo "server map '$map_file' is not readable, no mapping loaded" >&2
        return 0
    fi
    # IFS is set for the read builtin only, so the caller's IFS is never
    # modified, and read does not glob-expand the fields.
    while IFS=$' \t' read -r key value rest || [[ -n "$key" ]]; do
        [[ -n "$key" ]] || continue
        server_map[$key]="$value"
        echo "mapping $key => $value"
    done < "$map_file"
}


# get_addr ADDR
# Print the part of ADDR before the first ';' (for "ip:peer;client" this is
# "ip:peer").
function get_addr {
    # Local scratch array: a direct call must not clobber a global `tup`.
    local -a tup
    tup=($(split ';' "$1"))
    echo "${tup[0]}"
}

# get_ip ADDR
# Print the part of ADDR before the first ':' (the ip of "ip:peer;client").
function get_ip {
    # Local scratch array: a direct call must not clobber a global `tup`.
    local -a tup
    tup=($(split : "$1"))
    echo "${tup[0]}"
}

# get_peer_port ADDR
# Print the peer port of an "ip:peer_port;client_port" address, i.e. the
# first ';'-separated field of what follows the ':'.
function get_peer_port {
    # Local scratch arrays: a direct call must not clobber global variables.
    local -a tup tup2
    tup=($(split : "$1"))
    tup2=($(split ';' "${tup[1]}"))
    echo "${tup2[0]}"
}


# get_client_port ADDR
# Print the client port of an "ip:peer_port;client_port" address, i.e. the
# second ';'-separated field of what follows the ':'.
function get_client_port {
    # Local scratch arrays: a direct call must not clobber global variables.
    local -a tup tup2
    tup=($(split : "$1"))
    tup2=($(split ';' "${tup[1]}"))
    echo "${tup2[1]}"
}


# get_ip_by_id ID
# Print the ip recorded in the nodes table for node ID.
function get_ip_by_id {
    local node_id="$1"
    get_ip "${nodes[$node_id]}"
}

# get_peer_port_by_id ID
# Print the peer port recorded in the nodes table for node ID.
function get_peer_port_by_id {
    local node_id="$1"
    get_peer_port "${nodes[$node_id]}"
}


# get_client_port_by_id ID
# Print the client port recorded in the nodes table for node ID.
function get_client_port_by_id {
    local node_id="$1"
    get_client_port "${nodes[$node_id]}"
}

# copy_file PATTERN LOCAL_PATH REMOTE_IP REMOTE_PATH
# Instantiate a copy command PATTERN (see copy_to_remote_pat and
# copy_from_remote_pat) and run it through eval. The command line and
# everything it prints are appended to ./log.
function copy_file {
    local cmd="$1"
    # placeholder / value pairs, applied in this order
    local -a subst=(
        "<local_path>" "$2"
        "<remote_ip>" "$3"
        "<remote_user>" "$remote_user"
        "<remote_path>" "$4"
    )
    local i
    for ((i = 0; i < ${#subst[@]}; i += 2)); do
        cmd="${cmd//${subst[i]}/${subst[i + 1]}}"
    done
    echo $cmd
    eval "$cmd"
} >> log 2>&1

# execute_remote_cmd_pid NODE_IP COMMAND LOGFILE
# Feed a one-line script to the shell opened by exe_remote_pat: COMMAND is
# started in the background with stdout/stderr redirected to LOGFILE, and the
# pid of that background job is printed.
function execute_remote_cmd_pid {
    local node_ip="$1"
    local script="$2"
    local logfile="$3"
    local launcher="${exe_remote_pat//<remote_ip>/$node_ip}"
    launcher="${launcher//<remote_user>/$remote_user}"
    # The script is delivered on the launcher's stdin.
    eval $launcher <<< "$script > $logfile 2>&1 & echo \$!"
}



# execute_remote_cmd_stat NODE_IP COMMAND LOGFILE
# Feed a one-line script to the shell opened by exe_remote_pat: COMMAND runs
# in the foreground with stdout/stderr redirected to LOGFILE, and its exit
# status is printed.
function execute_remote_cmd_stat {
    local node_ip="$1"
    local script="$2"
    local logfile="$3"
    local launcher="${exe_remote_pat//<remote_ip>/$node_ip}"
    launcher="${launcher//<remote_user>/$remote_user}"
    # The script is delivered on the launcher's stdin.
    eval $launcher <<< "$script > $logfile 2>&1 ; echo \$?"
}


# _remote_load WORKDIR RWORKDIR NODE_IP RID [EXTRA_CONF...]
# Create RWORKDIR on the node, then copy the shared template directory and the
# node-specific extra config files (taken from WORKDIR/RID/) into it.
# Extra config names may be given as one space-separated string, as several
# arguments, or as a mix of both.
# Dies when the remote directory cannot be created.
function _remote_load {
    local workdir="$1"
    local rworkdir="$2"
    local node_ip="$3"
    local rid="$4"
    local tmpldir="$workdir/$template_dir/"
    local node_tmpldir="$workdir/$rid"
    local -a extra_conf=()
    local arg conf

    # Collect every argument from the 5th on; reading only $5 would drop all
    # but the first file when the caller passes them separately.
    for arg in "${@:5}"; do
        extra_conf+=($arg)   # unquoted on purpose: split a space-separated list
    done

    [[ $(execute_remote_cmd_stat "$node_ip" \
        "mkdir -p \"$rworkdir\"" \
        /dev/null) == 0 ]] || die "failed to create directory $rworkdir"
    copy_file "$copy_to_remote_pat" "$tmpldir" "$node_ip" "$rworkdir"
    for conf in "${extra_conf[@]}"; do
        copy_file "$copy_to_remote_pat" "$node_tmpldir/$conf" "$node_ip" "$rworkdir"
    done
}

# _remote_start WORKDIR RWORKDIR NODE_ID NODE_IP CLIENT_PORT
# Instantiate run_remote_pat for the node, launch it in the background on
# NODE_IP with its output going to RWORKDIR/$remote_log, and store the pid
# that is printed in WORKDIR/NODE_ID.pid.
function _remote_start {
    local workdir="$1" rworkdir="$2" node_id="$3" node_ip="$4" client_port="$5"
    local launch="$run_remote_pat"
    launch="${launch//<rworkdir>/$rworkdir}"
    launch="${launch//<node_id>/$node_id}"
    launch="${launch//<cport>/$client_port}"
    local pid_file="$workdir/${node_id}.pid"
    # The log path is wrapped in literal quotes for the remote shell.
    execute_remote_cmd_pid "$node_ip" "$launch" "\"$rworkdir/$remote_log\"" > "$pid_file"
}

# _remote_exec WORKDIR RWORKDIR NODE_IP COMMAND
# Run COMMAND on NODE_IP with its output discarded; succeed iff the reported
# exit status is 0. WORKDIR and RWORKDIR are accepted but not used.
function _remote_exec {
    local node_ip="$3"
    local cmd="$4"
    local status
    status="$(execute_remote_cmd_stat "$node_ip" "$cmd" /dev/null)"
    [[ "$status" == 0 ]]
}

# _remote_stop WORKDIR RWORKDIR NODE_IP NODE_PID
# Send the default kill signal to NODE_PID on NODE_IP.
function _remote_stop {
    local workdir="$1" rworkdir="$2" node_ip="$3" node_pid="$4"
    _remote_exec "$workdir" "$rworkdir" "$node_ip" "kill $node_pid"
}

# _remote_status WORKDIR RWORKDIR NODE_IP NODE_PID
# Succeed iff `kill -0 NODE_PID` succeeds on NODE_IP.
function _remote_status {
    local workdir="$1" rworkdir="$2" node_ip="$3" node_pid="$4"
    _remote_exec "$workdir" "$rworkdir" "$node_ip" "kill -0 $node_pid"
}

# _remote_finished WORKDIR RWORKDIR NODE_IP
# Succeed iff the node's remote log (RWORKDIR/$remote_log) contains
# $fin_keyword.
function _remote_finished {
    # Take the remote directory from our own argument instead of relying on a
    # variable named `rworkdir` being visible from the caller's scope.
    local rworkdir="$2"
    _remote_exec "$1" "$rworkdir" "$3" "grep \"$fin_keyword\" \"$rworkdir/$remote_log\""
}

# _remote_fetch WORKDIR RWORKDIR NODE_ID NODE_IP
# Copy the node's remote log (RWORKDIR/$remote_log) to WORKDIR/NODE_ID.log.
function _remote_fetch {
    local workdir="$1" rworkdir="$2" node_id="$3" node_ip="$4"
    local local_log="$workdir/${node_id}.log"
    local remote_log_path="$rworkdir/$remote_log"
    copy_file "$copy_from_remote_pat" "$local_log" "$node_ip" "$remote_log_path"
}

# start_all WORKDIR
# Set up a fresh experiment in WORKDIR and start every node:
#   1. create WORKDIR (dies if it already exists) with its template directory,
#      and snapshot the peer list, server map and shared configuration there;
#   2. for every node id 0..N-1, stage its extra config files in WORKDIR/<id>
#      and upload the template plus those files to the node;
#   3. launch the server on every node; the pids end up in WORKDIR/<id>.pid.
# Steps 2 and 3 run in background batches of at most $async_num jobs.
# Node ids are expected to be the integers 0..N-1.
function start_all {
    local workdir="$1"
    local tmpldir="$workdir/$template_dir/"
    mkdir "$workdir" > /dev/null 2>&1 || die "workdir already exists"
    rm -rf "$tmpldir"
    mkdir "$tmpldir"
    cp "$peer_list" "$workdir/peer_list.txt"
    cp "$server_map" "$workdir/server_map.txt"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    echo "copying configuration file"
    rsync -avP "$conf_src" "$tmpldir/$proj_conf_name"
    echo "${node_list[@]}"

    local cnt="${#nodes[@]}"
    local rid c
    local ip pport cport rworkdir node_tmpldir
    local -a extra_conf_

    # Phase 1: stage and upload the files of every node.
    c=0
    for ((rid = 0; rid < cnt; rid++)); do
        node_tmpldir="$workdir/$rid"
        ip="$(get_ip_by_id "$rid")"
        ip="${server_map[$ip]:-$ip}"
        pport="$(get_peer_port_by_id "$rid")"
        cport="$(get_client_port_by_id "$rid")"
        rworkdir="$remote_base/$workdir/${rid}"
        extra_conf_=(${node_confs[$rid]})
        rm -rf "$node_tmpldir"
        mkdir "$node_tmpldir"
        (
            # Space-separated basenames of the staged extra config files.
            # They are handed to _remote_load as ONE argument, which uploads
            # each of them from $node_tmpldir once the remote directory
            # exists. No upload is attempted here: the remote directory has
            # not been created yet at this point.
            extra_conf=""
            for conf in "${extra_conf_[@]}"; do
                cp "$conf" "$node_tmpldir/"
                extra_conf+="${extra_conf:+ }$(basename "$conf")"
            done
            echo "Starting $rid @ $ip, $pport and $cport"
            _remote_load "$workdir" "$rworkdir" "$ip" "$rid" "$extra_conf"
            echo "$rid loaded"
        ) &
        c=$((c + 1))
        if [[ "$c" -eq "$async_num" ]]; then
            c=0
            wait
        fi
    done
    wait

    # Phase 2: launch the server on every node.
    c=0
    for ((rid = 0; rid < cnt; rid++)); do
        ip="$(get_ip_by_id "$rid")"
        ip="${server_map[$ip]:-$ip}"
        pport="$(get_peer_port_by_id "$rid")"
        cport="$(get_client_port_by_id "$rid")"
        rworkdir="$remote_base/$workdir/${rid}"
        (
            echo "Starting $rid @ $ip, $pport and $cport"
            _remote_start "$workdir" "$rworkdir" "$rid" "$ip" "$cport"
            echo "$rid started"
        ) &
        c=$((c + 1))
        if [[ "$c" -eq "$async_num" ]]; then
            c=0
            wait
        fi
    done
    wait
}

# fetch_all WORKDIR
# Copy the remote log of every node into WORKDIR/<id>.log, all nodes in
# parallel, printing one "copied"/"failed" line per node.
function fetch_all {
    local workdir="$1"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local rid ip port rworkdir msg
    for rid in "${!nodes[@]}"; do
        ip="$(get_ip_by_id "$rid")"
        ip="${server_map[$ip]:-$ip}"
        port="$(get_peer_port_by_id "$rid")"
        rworkdir="$remote_base/$workdir/${rid}"
        # The pid file is not needed for fetching, so it is not read here;
        # logs can be fetched even when a node was never started.
        msg="Fetching $rid @ $ip, $port "
        {
            if _remote_fetch "$workdir" "$rworkdir" "$rid" "$ip"; then
                echo "$msg: copied"
            else
                echo "$msg: failed"
            fi
        } &
    done
    wait
}

# exec_all WORKDIR COMMAND
# Run COMMAND on every node (ids 0..N-1) in background batches of at most
# $async_num jobs and print one "succeeded"/"failed" line per node.
function exec_all {
    local workdir="$1"
    local cmd="$2"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local ip port rworkdir msg
    cnt="${#nodes[@]}"
    c=0
    for ((rid = 0; rid < cnt; rid++)); do
        ip="$(get_ip_by_id $rid)"
        ip="${server_map[$ip]:-$ip}"
        port="$(get_peer_port_by_id $rid)"
        rworkdir="$remote_base/$workdir/${rid}"
        msg="Executing $rid @ $ip, $port "
        _remote_exec "$workdir" "$rworkdir" "$ip" "$cmd" && echo "$msg: succeeded" || echo "$msg: failed" &
        # batch is full: wait for it before launching more jobs
        if (( ++c == async_num )); then
            c=0
            wait
        fi
    done
    wait
}

# reset_all WORKDIR
# Run the reset command ($reset_remote_pat) on every node.
function reset_all {
    local workdir="$1"
    exec_all "$workdir" "$reset_remote_pat"
}

# stop_all WORKDIR
# Kill the process recorded in WORKDIR/<id>.pid on every node, all nodes in
# parallel, printing one "stopped"/"failed" line per node.
function stop_all {
    local workdir="$1"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local ip port rworkdir pid msg
    for rid in "${!nodes[@]}"; do
        ip="$(get_ip_by_id $rid)"
        ip="${server_map[$ip]:-$ip}"
        port="$(get_peer_port_by_id $rid)"
        rworkdir="$remote_base/$workdir/${rid}"
        pid="$(cat $workdir/${rid}.pid)"
        msg="Killing $rid @ $ip, $port "
        _remote_stop "$workdir" "$rworkdir" "$ip" "$pid" && echo "$msg: stopped" || echo "$msg: failed" &
    done
    wait
}

# status_all WORKDIR
# For every node (ids 0..N-1) report whether the process recorded in
# WORKDIR/<id>.pid is still alive, one "running"/"dead" line per node.
# Checks run in background batches of at most $async_num jobs.
function status_all {
    local workdir="$1"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local ip port rworkdir pid msg
    cnt="${#nodes[@]}"
    c=0
    for ((rid = 0; rid < cnt; rid++)); do
        ip="$(get_ip_by_id $rid)"
        ip="${server_map[$ip]:-$ip}"
        port="$(get_peer_port_by_id $rid)"
        rworkdir="$remote_base/$workdir/${rid}"
        pid="$(cat $workdir/${rid}.pid)"
        msg="$rid @ $ip, $port "
        _remote_status "$workdir" "$rworkdir" "$ip" "$pid" && echo "$msg: running" || echo "$msg: dead" &
        # batch is full: wait for it before launching more jobs
        if (( ++c == async_num )); then
            c=0
            wait
        fi
    done
    wait
}

# finished_all WORKDIR
# For every node whose id does not match $fin_chk_skip_pat, print
# "<id> @ <ip>, <port> finished" or "... in-progress", depending on whether
# its remote log already contains $fin_keyword.
function finished_all {
    local workdir="$1"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local rid
    for rid in "${!nodes[@]}"; do
        # Decide on skipping first: no lookups are needed for skipped nodes.
        if [[ "$rid" =~ $fin_chk_skip_pat ]]; then
            continue
        fi
        local ip="$(get_ip_by_id "$rid")"
        ip="${server_map[$ip]:-$ip}"
        local port="$(get_peer_port_by_id "$rid")"
        local rworkdir="$remote_base/$workdir/${rid}"
        # Data is passed as an argument, never as the printf format string.
        printf '%s' "$rid @ $ip, $port "
        if _remote_finished "$workdir" "$rworkdir" "$ip"; then
            echo "finished"
        else
            echo "in-progress"
        fi
    done
}

# wait_all WORKDIR
# Poll every $fin_chk_period seconds until the remote log of every node that
# is not excluded by $fin_chk_skip_pat contains $fin_keyword, then print
# "finished". A polling round stops at the first unfinished node.
function wait_all {
    local workdir="$1"
    get_node_info "$workdir/peer_list.txt"
    get_server_map "$workdir/server_map.txt"
    local ip port rworkdir
    while :; do
        finished=1
        printf "checking the nodes..."
        for rid in "${!nodes[@]}"; do
            ip="$(get_ip_by_id $rid)"
            ip="${server_map[$ip]:-$ip}"
            port="$(get_peer_port_by_id $rid)"
            rworkdir="$remote_base/$workdir/${rid}"
            # skipped node: nothing to check
            [[ "$rid" =~ $fin_chk_skip_pat ]] && continue
            # finished node: move on to the next one
            _remote_finished "$workdir" "$rworkdir" "$ip" && continue
            finished=0
            break
        done
        [[ $finished == 1 ]] && break
        echo "not finished yet, wait for $fin_chk_period secs"
        sleep "$fin_chk_period"
    done
    echo "finished"
}

# check_all WORKDIR
# Die with "some nodes are dead" when status_all reports at least one dead
# node; print "ok" otherwise.
function check_all {
    if status_all "$1" | grep dead -q; then
        die "some nodes are dead"
    fi
    echo "ok"
}

# print_help
# Print the usage text (commands and every long option accepted by the option
# parser) and exit with status 0.
function print_help {
    cat << EOF
Usage: $0 [OPTION]... COMMAND WORKDIR [CMD]

Commands:
    start                       deploy the files and start all nodes
    stop                        kill the recorded process of every node
    status                      show whether each node is running or dead
    check                       fail if any node is dead
    finished                    show whether each node has finished
    fetch                       copy the remote logs into WORKDIR
    wait                        block until all nodes have finished
    reset                       run the reset command on all nodes
    exec CMD                    run CMD on all nodes

Options:
    --help                      show this help and exit
    --bin NAME                  name of binary executable
    --path PATH                 path to the binary
    --conf NAME                 shared configuration filename
    --conf-src FILE             shared configuration source file
    --peer-list FILE            read peer list from FILE (default: $peer_list)
    --server-map FILE           read server map from FILE (default: $server_map)
    --remote-base DIR           remote dir used to keep files for the experiment
    --remote-user USER          the username to login the remote machines
    --copy-to-remote-pat PAT    command pattern for copying files to a node
    --copy-from-remote-pat PAT  command pattern for copying files from a node
    --exe-remote-pat PAT        command pattern for opening a shell on a node
    --run-remote-pat PAT        command pattern for launching the server
    --reset-remote-pat PAT      command executed by the reset command
    --fin-keyword WORD          log keyword indicating completion of execution
    --fin-chk-period SECS       polling period of the wait command
    --fin-chk-skip-pat REGEX    node ids matching REGEX are skipped by
                                finished and wait
    --force-peer-list           force the use of FILE specified by --peer-list
                                instead of the peer list in WORKDIR
EOF
    exit 0
}

# check_argnum EXPECTED ARG...
# Die unless exactly EXPECTED arguments follow EXPECTED itself.
function check_argnum {
    # everything after the first parameter counts as an argument
    argnum=$(( $# - 1 ))
    if [[ "$1" -ne "$argnum" ]]; then
        die "incorrect argnum: got $argnum, $1 expected"
    fi
}

# Command line parsing relies on the enhanced getopt, which identifies itself
# by exiting with status 4 when called with --test.
getopt --test > /dev/null
[[ $? -ne 4 ]] && die "getopt unsupported"

# No short options; long options that take a value end with ':'.
# The list is assembled piecewise so that it holds nothing but option names
# and commas. Inside single quotes a backslash-newline is NOT a line
# continuation, so a single-quoted multi-line list would hand stray
# backslashes and newlines to getopt.
SHORT=
LONG='bin:,path:,conf:,conf-src:'
LONG+=',peer-list:'
LONG+=',server-map:'
LONG+=',remote-base:'
LONG+=',remote-user:'
LONG+=',copy-to-remote-pat:'
LONG+=',copy-from-remote-pat:'
LONG+=',exe-remote-pat:'
LONG+=',run-remote-pat:'
LONG+=',reset-remote-pat:'
LONG+=',fin-keyword:'
LONG+=',fin-chk-period:'
LONG+=',fin-chk-skip-pat:'
LONG+=',force-peer-list'
LONG+=',help'

# Let getopt normalize the command line, then install the result as the
# positional parameters; getopt has already reported any parse error.
PARSED=$(getopt --options "$SHORT" --longoptions "$LONG" --name "$0" -- "$@")
[[ $? -ne 0 ]] && exit 1
eval set -- "$PARSED"

# Consume the options normalized by getopt. Every option taking a value
# stores it in the matching setting and shifts two words; flags shift one.
# The loop ends at the "--" separator, leaving COMMAND and its arguments as
# the positional parameters.
# NOTE(review): --bin and --path only update proj_server_bin/proj_server_path;
# the default run/reset patterns were already built from the old values, so
# they keep using those unless --run-remote-pat/--reset-remote-pat are given.
while true; do
    case "$1" in
        --bin) proj_server_bin="$2"; shift 2;;
        --path) proj_server_path="$2"; shift 2;;
        --conf) proj_conf_name="$2"; shift 2;;
        --conf-src) conf_src="$2"; shift 2;;
        --peer-list) peer_list="$2"; shift 2;;
        --server-map) server_map="$2"; shift 2;;
        --remote-base) remote_base="$2"; shift 2;;
        --remote-user) remote_user="$2"; shift 2;;
        --copy-to-remote-pat) copy_to_remote_pat="$2"; shift 2;;
        --copy-from-remote-pat) copy_from_remote_pat="$2"; shift 2;;
        --exe-remote-pat) exe_remote_pat="$2"; shift 2;;
        --run-remote-pat) run_remote_pat="$2"; shift 2;;
        --reset-remote-pat) reset_remote_pat="$2"; shift 2;;
        --fin-keyword) fin_keyword="$2"; shift 2;;
        --fin-chk-period) fin_chk_period="$2"; shift 2;;
        --fin-chk-skip-pat) fin_chk_skip_pat="$2"; shift 2;;
        --force-peer-list) force_peer_list=1; shift 1;;
        # print_help exits, so the shift after it is never reached
        --help) print_help; shift 1;;
        # end of options
        --) shift; break;;
        # any other word here means the option list and this loop disagree
        *) die "internal error";;
    esac
done
# Dispatch: the first remaining word is the command, the rest are its
# arguments. Every command except `exec` takes exactly WORKDIR and is
# implemented by the function named <command>_all; anything else prints the
# help text.
cmd="$1"
shift 1
case "$cmd" in
    start|stop|status|check|finished|fetch|wait|reset)
        check_argnum 1 "$@" && "${cmd}_all" "$1"
        ;;
    exec)
        check_argnum 2 "$@" && exec_all "$1" "$2"
        ;;
    *)
        print_help
        ;;
esac