cloverleaf-larry/lib/nc-parse.sh

520 lines
22 KiB
Bash
Executable File

#!/usr/bin/env bash
# nc-parse.sh — first-class native Cloverleaf NetConfig parser for Larry-Anywhere v3.
# Pure bash + awk plus standard text utilities (grep, sed, sort, head, cat). No v1/v2 dependencies.
#
# The NetConfig is a TCL-style nested-block file with two top-level declarations:
# - process <name> { ... } — process containers
# - protocol <name> { ... } — threads (the operational unit)
#
# This parser exposes structured access to those blocks.
#
# Usage:
# nc-parse.sh <subcommand> <netconfig_path> [args...]
#
# Subcommands:
# list-protocols — one protocol name per line
# list-processes — one process name per line
# protocol-line <NAME> — line number where `protocol NAME {` appears
# protocol-block <NAME> — emit the full TCL block for NAME
# protocol-field <NAME> <FIELD> — emit value of top-level field for NAME
# (e.g. PROCESSNAME, OUTBOUNDONLY, OBWORKASIB)
# protocol-nested <NAME> <PATH> — drill into nested block, e.g. "PROTOCOL.PORT"
# protocol-summary [--all|--filter R] — TSV summary of all protocols with key fields
# destinations <NAME> — list DEST values from DATAXLATE routing block
# sources <NAME> — inverse: protocols that DEST to NAME
# xlate-refs [<NAME>] — list xlate .xlt files referenced
# tclproc-refs [<NAME>] — list TCL proc names referenced
# route-block <NAME> — emit the DATAXLATE block (the routing config)
# index — single-pass route INDEX (P/D/L/O/X
# records) for the in-memory path walker;
# see cmd_index. Parses the file ONCE.
# help — this help
#
# Route-chain PATH enumeration (root-to-leaf chains, all-mode, cross-site) lives
# in lib/nc-paths.sh — it is the single walker backend built on the one-hop
# destinations/sources primitives here. The old `chain` subcommand was removed.
#
# Exit codes: 0 OK, 1 usage error, 2 not found, 3 parse error.
# Treat unset variables as errors and make a pipeline fail when any stage fails.
# (errexit is deliberately NOT enabled.)
set -u -o pipefail
# Path this script was invoked as; cmd_help reads the usage header from it.
NC_SELF=$0
# die MESSAGE... — print "nc-parse: MESSAGE" on stderr and terminate with
# status 1.
die() {
  printf 'nc-parse: %s\n' "$*" 1>&2
  exit 1
}
# require_file PATH — terminate with status 2 and a diagnostic on stderr
# unless PATH is an existing regular file.
require_file() {
  if [ ! -f "$1" ]; then
    printf 'nc-parse: not a file: %s\n' "$1" >&2
    exit 2
  fi
}
# ─────────────────────────────────────────────────────────────────────────────
# Core: emit each top-level block as record `TYPE\tNAME\tSTART_LINE\tEND_LINE`
# Robust to braces nested arbitrarily.
# ─────────────────────────────────────────────────────────────────────────────
_blocks() {
  local netconfig="$1"
  # One awk pass. A block starts on a line that is exactly
  # `process NAME {` or `protocol NAME {` and ends on the line where the
  # running brace balance returns to zero. Every { and } on a line is counted.
  awk '
    # Header of a top-level block: remember what it is and where it starts.
    !inside && /^(process|protocol) [A-Za-z0-9_]+ \{$/ {
      split($0, hdr, " ")
      kind = hdr[1]; ident = hdr[2]; first = NR
      level = 1; inside = 1
      next
    }
    # Any other line inside a block only moves the brace balance.
    inside {
      text = $0
      level += gsub(/\{/, "{", text) - gsub(/\}/, "}", text)
      if (level == 0) {
        printf "%s\t%s\t%d\t%d\n", kind, ident, first, NR
        inside = 0; kind = ""; ident = ""; first = 0
      }
    }
  ' "$netconfig"
}
# ─────────────────────────────────────────────────────────────────────────────
# SINGLE-PASS INDEX (v0.8.20 perf rearchitecture).
#
# Emits, in ONE awk pass over the NetConfig, every fact the path WALKER needs so
# it never has to re-invoke a subprocess or re-parse the file per hop. The old
# walker called nc-parse.sh (destinations/sources/protocol-nested/...) once PER
# HOP PER CANDIDATE, and each of those re-ran _blocks + cmd_protocol_block (two
# full awk passes over a 16K-line file). On the real 24-site integrator that was
# O(threads x parse-cost) = minutes. This subcommand replaces all of it: parse
# ONCE, walk in memory.
#
# Output is a flat TAB-separated record stream (one record per line). The leading
# single-char tag identifies the record kind:
# P <thread> protocol declared in this NetConfig
# D <thread> <dest> a DATAXLATE DEST edge thread->dest
# (handles BOTH `{ DEST name }` and the
# list form `{ DEST {a b c} }`)
# L <thread> <port> a LISTEN port for <thread>:
# server PROTOCOL.PORT (ISSERVER=1) and/or
# the guarded top-level ICLSERVERPORT.
# O <thread> <port> an OUTBOUND/tcpip-client dest port for
# <thread> (PROTOCOL.PORT with ISSERVER!=1).
# X <destname> <site> <thread> <port> a top-level `destination` block: the
# AUTHORITATIVE cross-site link. A DEST that
# names <destname> hops to <thread> in <site>
# (PORT is the connecting port). This is how
# Cloverleaf actually links sites (ICL) — by
# named destination, resolved to SITE+THREAD,
# NOT by blindly matching ports.
#
# Robust to arbitrary brace nesting (same depth bookkeeping as _blocks). DEST,
# ISSERVER, PORT and ICLSERVERPORT are recognised by their canonical one-line
# `{ KEY value }` rendering; absent/`{}` values are simply never emitted (the
# guard that the old paths.tcl lacked).
# ─────────────────────────────────────────────────────────────────────────────
cmd_index() {
  local netconfig="$1"
  require_file "$netconfig"
  # Single awk pass. The scan_* helpers harvest fields from a line BEFORE that
  # line is folded into the brace depth; emit_block flushes the per-block
  # aggregate records (L/O/X) once the depth is back to zero.
  awk '
    # Harvest DEST edges, the ICL listen port and PROTOCOL.PORT / ISSERVER
    # from one line of the current protocol block.
    function scan_protocol(text,    val, members, count, k) {
      if (match(text, /\{ DEST [A-Za-z0-9_]+ \}/)) {
        # single form: skip "{ DEST " (7 chars), drop the trailing " }"
        print "D\t" blk_name "\t" substr(text, RSTART+7, RLENGTH-9)
      } else if (match(text, /\{ DEST \{[^}]*\}/)) {
        # list form: skip "{ DEST {" (8 chars), drop the closing "}"
        count = split(substr(text, RSTART+8, RLENGTH-9), members, /[ \t]+/)
        for (k = 1; k <= count; k++)
          if (members[k] != "") print "D\t" blk_name "\t" members[k]
      }
      # numeric ICLSERVERPORT statement alone on its line
      if (text ~ /^[[:space:]]+\{ ICLSERVERPORT [0-9]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ ICLSERVERPORT /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        icl_port = val
      }
      # opening of the nested { PROTOCOL { ... } } sub-block
      if (!in_proto && text ~ /^[[:space:]]+\{ PROTOCOL \{$/) {
        in_proto = 1
        proto_floor = level + 1
        return
      }
      if (!in_proto) return
      if (text ~ /^[[:space:]]+\{ PORT [0-9]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ PORT /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        proto_port = val
      }
      if (text ~ /^[[:space:]]+\{ ISSERVER [0-9]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ ISSERVER /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        is_server = val
      }
    }

    # Harvest SITE / THREAD / PORT from one line of a destination block.
    function scan_destination(text,    val) {
      if (text ~ /^[[:space:]]+\{ SITE [A-Za-z0-9_]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ SITE /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        dst_site = val
      }
      if (text ~ /^[[:space:]]+\{ THREAD [A-Za-z0-9_]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ THREAD /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        dst_thread = val
      }
      if (text ~ /^[[:space:]]+\{ PORT [0-9]+ \}[[:space:]]*$/) {
        val = text
        sub(/^[[:space:]]+\{ PORT /, "", val)
        sub(/ \}[[:space:]]*$/, "", val)
        dst_port = val
      }
    }

    # Emit the aggregate records of the block that just closed.
    function emit_block() {
      if (blk_type == "protocol") {
        # listen ports: server PROTOCOL.PORT (ISSERVER=1) and/or the ICL port
        if (is_server == "1" && proto_port != "") print "L\t" blk_name "\t" proto_port
        if (icl_port != "") print "L\t" blk_name "\t" icl_port
        # any other PROTOCOL.PORT is an outbound/client port
        if (is_server != "1" && proto_port != "") print "O\t" blk_name "\t" proto_port
      } else if (blk_type == "destination") {
        if (dst_site != "" && dst_thread != "")
          print "X\t" blk_name "\t" dst_site "\t" dst_thread "\t" dst_port
      }
    }

    # ---- header of a top-level block: reset all per-block state ------------
    !active && /^(process|protocol|destination) [A-Za-z0-9_]+ \{$/ {
      split($0, hdr, " ")
      blk_type = hdr[1]; blk_name = hdr[2]
      level = 1; active = 1
      in_proto = 0; proto_floor = 0
      proto_port = ""; is_server = ""; icl_port = ""
      dst_site = ""; dst_thread = ""; dst_port = ""
      if (blk_type == "protocol") print "P\t" blk_name
      next
    }

    # ---- body line: harvest fields first, then update the brace depth ------
    active {
      text = $0
      if (blk_type == "protocol") scan_protocol(text)
      else if (blk_type == "destination") scan_destination(text)

      level += gsub(/\{/, "{", text) - gsub(/\}/, "}", text)
      if (in_proto && level < proto_floor) in_proto = 0

      if (level == 0) {
        emit_block()
        active = 0; blk_type = ""; blk_name = ""
      }
    }
  ' "$netconfig"
}
# cmd_list_protocols NETCONFIG — print the name of every protocol block, one
# per line, in file order.
cmd_list_protocols() {
  local netconfig="$1"
  require_file "$netconfig"
  # _blocks records are TYPE<TAB>NAME<TAB>START<TAB>END; keep NAME of protocols.
  _blocks "$netconfig" | awk -F'\t' '{ if ($1 == "protocol") print $2 }'
}
# cmd_list_processes NETCONFIG — print the name of every process block, one
# per line, in file order.
cmd_list_processes() {
  local netconfig="$1"
  require_file "$netconfig"
  # _blocks records are TYPE<TAB>NAME<TAB>START<TAB>END; keep NAME of processes.
  _blocks "$netconfig" | awk -F'\t' '{ if ($1 == "process") print $2 }'
}
# cmd_protocol_line NETCONFIG NAME — print the line number of the
# `protocol NAME {` header. Prints nothing when NAME is not declared.
cmd_protocol_line() {
  local nc="$1" name="$2"
  require_file "$nc"
  # Column 3 of a _blocks record is the start line of the block.
  _blocks "$nc" \
    | awk -F'\t' -v want="$name" '{ if ($1 == "protocol" && $2 == want) print $3 }'
}
# cmd_protocol_block NETCONFIG NAME — print the lines of NAME's protocol block,
# from its `protocol NAME {` header through its closing brace. Exits with
# status 2 and a diagnostic on stderr when no such protocol is declared.
cmd_protocol_block() {
  local nc="$1" name="$2"
  require_file "$nc"
  # Resolve the block to a "START,END" line span via the _blocks index.
  local span
  span=$(_blocks "$nc" \
    | awk -F'\t' -v want="$name" '$1 == "protocol" && $2 == want { print $3 "," $4 }')
  if [ -z "$span" ]; then
    printf 'nc-parse: no such protocol: %s\n' "$name" >&2
    exit 2
  fi
  # Second pass: echo exactly the lines inside that span.
  awk -v span="$span" '
    BEGIN { split(span, edge, ",") }
    edge[1] <= NR && NR <= edge[2] { print }
  ' "$nc"
}
# Top-level fields are lines like: ` { FIELD value }` at depth 1 inside the protocol block.
# Strip surrounding `{ ... }` and emit value(s).
# cmd_protocol_field NETCONFIG NAME FIELD — print the value of each
# `{ FIELD value }` statement sitting directly inside NAME's protocol block
# (brace depth 1), with the `{ FIELD ` prefix and a trailing ` }` removed.
cmd_protocol_field() {
  local nc="$1" name="$2" field="$3"
  require_file "$nc"
  cmd_protocol_block "$nc" "$name" | awk -v F="$field" '
    BEGIN {
      level = 0
      # indentation, opening brace, the field name, then one space
      lead = "^[[:space:]]+\\{ " F " "
    }
    {
      text = $0
      # Depth BEFORE this line is what decides whether the statement is a
      # direct child of the protocol block.
      before = level
      level += gsub(/\{/, "{", text) - gsub(/\}/, "}", text)
      if (before != 1 || text !~ lead) next
      sub(lead, "", text)
      sub(" \\}$", "", text)
      print text
    }
  '
}
# cmd_protocol_nested NETCONFIG NAME DOTTED.PATH
# Drill into NAME's protocol block along a dotted path, e.g. PROTOCOL.PORT.
# Every component except the last must open a nested `{ KEY {` block; the last
# component names a one-line `{ KEY value }` statement whose value is printed
# (prefix `{ KEY ` and trailing ` }` removed). A path with a single component
# reads a statement directly inside the protocol block.
# Exit status: 2 when the protocol does not exist (status propagated from
# cmd_protocol_block) or when an intermediate key yields no body.
cmd_protocol_nested() {
  local nc="$1" name="$2" path="$3"
  require_file "$nc"
  # Keep the split path local so it does not leak into the global scope.
  local -a parts=()
  IFS='.' read -ra parts <<< "$path"
  local block
  # Propagate the failure status as-is (2 = no such protocol) instead of
  # flattening it to 1, which the header documents as "usage error".
  block=$(cmd_protocol_block "$nc" "$name") || return "$?"
  local current="$block"
  local i key baseline
  for ((i = 0; i < ${#parts[@]}; i++)); do
    key="${parts[$i]}"
    if [ $((i + 1)) -eq ${#parts[@]} ]; then
      # Last part: extract the scalar value.
      # The baseline depth depends on whether we drilled (the captured body has
      # no wrapper, so statements sit at depth 0) or are still looking at the
      # whole protocol block (wrapped by `protocol NAME {`, so depth 1).
      baseline=1
      [ "$i" -gt 0 ] && baseline=0
      printf '%s\n' "$current" | awk -v K="$key" -v BASE="$baseline" '
        BEGIN { depth = 0 }
        {
          line = $0
          n_open = gsub(/\{/, "{", line)
          n_close = gsub(/\}/, "}", line)
          prev = depth
          depth += n_open - n_close
          if (prev == BASE && line ~ "^[[:space:]]+\\{ " K " ") {
            sub("^[[:space:]]+\\{ " K " ", "", line)
            sub(" \\}$", "", line)
            print line
          }
        }
      '
    else
      # Drill: find the `{ KEY {` opening and capture the body up to, but not
      # including, the line that closes it.
      current=$(printf '%s\n' "$current" | awk -v K="$key" '
        BEGIN { depth = 0; capturing = 0; cap_depth = 0 }
        {
          line = $0
          n_open = gsub(/\{/, "{", line)
          n_close = gsub(/\}/, "}", line)
          if (!capturing && line ~ "^[[:space:]]+\\{ " K " \\{$") {
            capturing = 1
            cap_depth = depth + 1   # depth of the outer brace just opened
            depth += n_open - n_close
            next
          }
          if (capturing) {
            depth += n_open - n_close
            if (depth < cap_depth) {
              capturing = 0
              exit
            }
            print
          } else {
            depth += n_open - n_close
          }
        }
      ')
      if [ -z "$current" ]; then
        printf 'nc-parse: no nested key %s under %s\n' "$key" "$name" >&2
        exit 2
      fi
    fi
  done
}
# cmd_protocol_summary NETCONFIG [--all | --filter REGEX]
# Print a TSV table (header row first) with one row per protocol:
#   name process direction port host type isserver outonly obworkasib iclserverport
# --filter keeps only protocols whose name matches the extended regex REGEX;
# --all (the default) clears any filter. An unknown flag, or --filter without
# an operand, is a usage error (status 1 via die).
cmd_protocol_summary() {
  local nc="$1"; shift
  local filter=""
  while [ $# -gt 0 ]; do
    case "$1" in
      --all) filter="" ;;
      --filter)
        # Guard the operand: under `set -u` a trailing --filter would otherwise
        # abort on the unbound $1 with an opaque shell error.
        [ $# -ge 2 ] || die "--filter requires a REGEX argument"
        shift
        filter="$1"
        ;;
      *) die "unknown summary flag: $1" ;;
    esac
    shift
  done
  require_file "$nc"
  # Print TSV header
  printf "name\tprocess\tdirection\tport\thost\ttype\tisserver\toutonly\tobworkasib\ticlserverport\n"
  local names
  names=$(cmd_list_protocols "$nc")
  local n
  local pname obib outonly iclserv ptype phost pport isserver direction
  # Word-splitting is intentional: the list is one name per line.
  for n in $names; do
    if [ -n "$filter" ] && ! printf '%s' "$n" | grep -Eq -- "$filter"; then
      continue
    fi
    # Only the first value of each field is reported.
    pname=$(cmd_protocol_field "$nc" "$n" PROCESSNAME | head -1)
    obib=$(cmd_protocol_field "$nc" "$n" OBWORKASIB | head -1)
    outonly=$(cmd_protocol_field "$nc" "$n" OUTBOUNDONLY | head -1)
    iclserv=$(cmd_protocol_field "$nc" "$n" ICLSERVERPORT | head -1)
    # A protocol without a PROTOCOL sub-block simply yields empty columns, so
    # the "no nested key" diagnostic is silenced here.
    ptype=$(cmd_protocol_nested "$nc" "$n" PROTOCOL.TYPE 2>/dev/null | head -1)
    phost=$(cmd_protocol_nested "$nc" "$n" PROTOCOL.HOST 2>/dev/null | head -1)
    pport=$(cmd_protocol_nested "$nc" "$n" PROTOCOL.PORT 2>/dev/null | head -1)
    isserver=$(cmd_protocol_nested "$nc" "$n" PROTOCOL.ISSERVER 2>/dev/null | head -1)
    # Direction inference: ISSERVER wins, then OBWORKASIB, then OUTBOUNDONLY.
    if [ "$isserver" = "1" ]; then
      direction="inbound-tcp-listen"
    elif [ "$obib" = "1" ]; then
      direction="inbound-icl-or-file"
    elif [ "$outonly" = "1" ]; then
      direction="outbound"
    else
      direction="unknown"
    fi
    # Clean braces from values: `{}` becomes empty, `{x}` becomes `x`.
    phost=$(printf '%s' "$phost" | sed 's/^{}$//; s/^{//; s/}$//')
    pport=$(printf '%s' "$pport" | sed 's/^{}$//; s/^{//; s/}$//')
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
      "$n" "${pname:-}" "$direction" "${pport:-}" "${phost:-}" \
      "${ptype:-}" "${isserver:-}" "${outonly:-}" "${obib:-}" "${iclserv:-}"
  done
}
# cmd_destinations NETCONFIG NAME
# Print, sorted and de-duplicated, every thread named by a DEST statement
# anywhere in NAME's protocol block. Both renderings are recognised, matching
# what cmd_index emits as D records:
#   { DEST name }          single destination
#   { DEST {a b c} }       list of destinations
# An empty list `{ DEST {} }` contributes nothing.
cmd_destinations() {
  local nc="$1" name="$2"
  require_file "$nc"
  cmd_protocol_block "$nc" "$name" \
    | awk '
        {
          rest = $0
          # Consume every DEST statement on the line, left to right.
          while (match(rest, /\{ DEST (\{[^}]*\}|[A-Za-z0-9_]+ \})/)) {
            # value is either "{a b c}" or "name }" ("{ DEST " is 7 chars)
            value = substr(rest, RSTART + 7, RLENGTH - 7)
            rest = substr(rest, RSTART + RLENGTH)
            if (substr(value, 1, 1) == "{") {
              count = split(substr(value, 2, length(value) - 2), member, /[ \t]+/)
              for (k = 1; k <= count; k++)
                if (member[k] != "") print member[k]
            } else {
              print substr(value, 1, length(value) - 2)
            }
          }
        }
      ' | sort -u
}
# cmd_xlate_refs NETCONFIG [NAME]
# Print, sorted and de-duplicated, every `<word>.xlt` file name found in NAME's
# protocol block, or in the whole NetConfig when NAME is omitted or empty.
cmd_xlate_refs() {
  local nc="$1" name="${2:-}"
  require_file "$nc"
  local xlt_re='[A-Za-z0-9_]+\.xlt'
  if [ -z "$name" ]; then
    # No protocol given: scan the entire file.
    grep -oE "$xlt_re" "$nc" | sort -u
    return
  fi
  cmd_protocol_block "$nc" "$name" | grep -oE "$xlt_re" | sort -u
}
# cmd_sources NETCONFIG TARGET
# Inverse of cmd_destinations: print, in file order, every protocol whose block
# contains a DEST statement naming TARGET. Both the single form
# `{ DEST name }` and the list form `{ DEST {a b c} }` are recognised. The
# protocol called TARGET itself is never reported.
#
# The NetConfig is read ONCE: the block tracking mirrors _blocks (a
# `process`/`protocol` header opens a block, the brace balance closes it) and a
# name is emitted when its block closes, instead of re-extracting every
# protocol block in a shell loop.
cmd_sources() {
  local nc="$1" target="$2"
  require_file "$nc"
  awk -v target="$target" '
    # 1 when some DEST statement on the line names the target, else 0.
    function names_target(s,    value, member, count, k) {
      while (match(s, /\{ DEST (\{[^}]*\}|[A-Za-z0-9_]+ \})/)) {
        # value is either "{a b c}" or "name }" ("{ DEST " is 7 chars)
        value = substr(s, RSTART + 7, RLENGTH - 7)
        s = substr(s, RSTART + RLENGTH)
        if (substr(value, 1, 1) == "{") {
          count = split(substr(value, 2, length(value) - 2), member, /[ \t]+/)
          for (k = 1; k <= count; k++)
            if ((member[k] "") == want) return 1
        } else if (substr(value, 1, length(value) - 2) == want) {
          return 1
        }
      }
      return 0
    }
    # Force string type so names that look numeric are compared textually.
    BEGIN { want = target "" }
    !inside && /^(process|protocol) [A-Za-z0-9_]+ \{$/ {
      split($0, hdr, " ")
      kind = hdr[1]; ident = hdr[2] ""
      level = 1; inside = 1; hit = 0
      next
    }
    inside {
      text = $0
      if (!hit && kind == "protocol" && ident != want) hit = names_target(text)
      level += gsub(/\{/, "{", text) - gsub(/\}/, "}", text)
      if (level == 0) {
        if (hit) print ident
        inside = 0
      }
    }
  ' "$nc"
}
# cmd_tclproc_refs NETCONFIG [NAME]
# Print, sorted and de-duplicated, every TCL proc name referenced in NAME's
# protocol block, or in the whole NetConfig when NAME is omitted or empty.
# Recognised statements:
#   { PROC name ...        singleton
#   { PROCS name ...       singleton
#   { PROCS {a b} ...      list, with or without padding inside the braces
# Empty `{}` values and the PROCSCONTROL keyword never match.
# As before, a missing protocol is silent (its diagnostic is discarded) and
# the final grep makes the status non-zero when nothing was found.
cmd_tclproc_refs() {
  local nc="$1" name="${2:-}"
  require_file "$nc"
  # Stream the text straight into awk rather than holding it in a variable.
  {
    if [ -n "$name" ]; then
      cmd_protocol_block "$nc" "$name" 2>/dev/null
    else
      cat -- "$nc"
    fi
  } | awk '
    {
      line = $0
      # PROC <name> (singleton); "{ PROC " is 7 chars
      if (match(line, /\{ PROC [A-Za-z_][A-Za-z0-9_]*/)) {
        print substr(line, RSTART + 7, RLENGTH - 7)
      }
      # PROCS <name> (singleton); "{ PROCS " is 8 chars
      if (match(line, /\{ PROCS [A-Za-z_][A-Za-z0-9_]*/)) {
        print substr(line, RSTART + 8, RLENGTH - 8)
      }
      # PROCS {name1 name2 ...} (list); "{ PROCS {" is 9 chars, plus the
      # closing brace. Padding inside the braces is dropped by the split.
      if (match(line, /\{ PROCS \{[^}]+\}/)) {
        n = split(substr(line, RSTART + 9, RLENGTH - 10), arr, /[ \t]+/)
        for (i = 1; i <= n; i++) if (arr[i] != "") print arr[i]
      }
    }
  ' | sort -u | grep -v '^$'
}
# NOTE (v0.8.19): the old `cmd_chain` BFS-node-set walker was removed and
# CONSOLIDATED into lib/nc-paths.sh, which is now the SINGLE route-chain backend.
# cmd_chain only emitted a flat set of reachable nodes (depth/direction/thread),
# never enumerated root-to-leaf PATHS, was never wired into the LLM, and would
# have left two competing walkers. nc-paths.sh ports the v2 `paths` DFS
# enumerator (SITE/THREAD/HOPS/PATH output, all-mode, PORT-based cross-site links)
# and reuses the one-hop DEST primitives (cmd_destinations / cmd_sources) above
# for intra-site routing. Do not reintroduce a second walker here — extend
# nc-paths.sh.
# cmd_route_block NETCONFIG NAME — print the `{ DATAXLATE {` sub-block of
# NAME's protocol block, from its opening line through the line that closes
# it. Prints nothing when the block has no such opening line.
cmd_route_block() {
  local nc="$1" name="$2"
  require_file "$nc"
  cmd_protocol_block "$nc" "$name" | awk '
    {
      text = $0
      delta = gsub(/\{/, "{", text) - gsub(/\}/, "}", text)
      if (!grabbing) {
        # Still searching for the opening line; it is echoed as well.
        if (text ~ /^[[:space:]]+\{ DATAXLATE \{$/) {
          grabbing = 1
          base = level + 1
          print
        }
        level += delta
        next
      }
      # Inside the routing block: echo the line, then stop as soon as the
      # brace depth drops below the depth it was opened at.
      print
      level += delta
      if (level < base) exit
    }
  '
}
# cmd_help — print the usage header of this script: every comment line that
# follows the shebang, up to the first non-comment line. The header is located
# by content rather than by a fixed line range, so it is never cut short when
# entries are added to it.
cmd_help() {
  awk '
    !seen { if (/^#!/) seen = 1; next }   # skip up to and including the shebang
    /^#/  { print; next }                 # header comment line
          { exit }                        # first non-comment line ends the header
  ' "$NC_SELF"
}
# ─────────────────────────────────────────────────────────────────────────────
# Dispatch
# ─────────────────────────────────────────────────────────────────────────────
# The first argument selects the subcommand; with no arguments show the help.
# Each arm checks its minimum argument count (usage error, status 1, via die)
# before handing over to the matching cmd_* function.
SUB="${1:-help}"
case "$SUB" in
  list-protocols)
    if [ $# -lt 2 ]; then die "usage: $0 list-protocols <netconfig>"; fi
    cmd_list_protocols "$2"
    ;;
  list-processes)
    if [ $# -lt 2 ]; then die "usage: $0 list-processes <netconfig>"; fi
    cmd_list_processes "$2"
    ;;
  protocol-line)
    if [ $# -lt 3 ]; then die "usage: $0 protocol-line <netconfig> <name>"; fi
    cmd_protocol_line "$2" "$3"
    ;;
  protocol-block)
    if [ $# -lt 3 ]; then die "usage: $0 protocol-block <netconfig> <name>"; fi
    cmd_protocol_block "$2" "$3"
    ;;
  protocol-field)
    if [ $# -lt 4 ]; then die "usage: $0 protocol-field <netconfig> <name> <field>"; fi
    cmd_protocol_field "$2" "$3" "$4"
    ;;
  protocol-nested)
    if [ $# -lt 4 ]; then die "usage: $0 protocol-nested <netconfig> <name> <dotted.path>"; fi
    cmd_protocol_nested "$2" "$3" "$4"
    ;;
  protocol-summary)
    if [ $# -lt 2 ]; then die "usage: $0 protocol-summary <netconfig> [--filter REGEX]"; fi
    cmd_protocol_summary "$2" "${@:3}"
    ;;
  destinations)
    if [ $# -lt 3 ]; then die "usage: $0 destinations <netconfig> <name>"; fi
    cmd_destinations "$2" "$3"
    ;;
  sources)
    if [ $# -lt 3 ]; then die "usage: $0 sources <netconfig> <name>"; fi
    cmd_sources "$2" "$3"
    ;;
  chain)
    # Kept only to point callers of the removed subcommand at its replacement.
    die "the 'chain' subcommand was removed in v0.8.19 — use nc-paths.sh (route-chain path enumerator) instead"
    ;;
  xlate-refs)
    if [ $# -lt 2 ]; then die "usage: $0 xlate-refs <netconfig> [name]"; fi
    cmd_xlate_refs "$2" "${3:-}"
    ;;
  tclproc-refs)
    if [ $# -lt 2 ]; then die "usage: $0 tclproc-refs <netconfig> [name]"; fi
    cmd_tclproc_refs "$2" "${3:-}"
    ;;
  route-block)
    if [ $# -lt 3 ]; then die "usage: $0 route-block <netconfig> <name>"; fi
    cmd_route_block "$2" "$3"
    ;;
  index)
    if [ $# -lt 2 ]; then die "usage: $0 index <netconfig>"; fi
    cmd_index "$2"
    ;;
  help | -h | --help)
    cmd_help
    ;;
  *)
    die "unknown subcommand: $SUB (try '$0 help')"
    ;;
esac