Verification
Verify My Data
I publish a single Merkle root that commits to every byte of the trillion-bracket dataset.
You can verify the root or prove that any specific bracket (8-byte record) is included, without re-downloading everything.
What I publish
- manifest.json — shard list and sizes (records are uint64, 63 bits used).
- Shards: s00000.u64, s00001.u64, … — raw 8-byte records.
- chunk_hashes.bin — contiguous 32-byte BLAKE3 hashes, one for each 8 MiB chunk of data.
- merkle_tree.bin — all levels of the Merkle tree (leaves first), ending with the root.
- commitment.json — algorithm, parameters, and the root_hex you should compare against.
- (Optional) proofs/ — JSON inclusion proofs per chunk for fast, offline verification.
Hashing uses BLAKE3 when available, otherwise BLAKE2b-256.
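If you want to sanity-check the published leaf hashes on their own, chunk_hashes.bin can be read directly. A minimal sketch, assuming nothing beyond the contiguous 32-byte-per-chunk layout described above:

from pathlib import Path

def published_leaf_hash(data_dir: Path, chunk_index: int) -> bytes:
    """Return the published 32-byte hash of chunk `chunk_index` (a sketch)."""
    with open(data_dir / "chunk_hashes.bin", "rb") as fin:
        fin.seek(chunk_index * 32)     # digests are stored contiguously, 32 bytes each
        digest = fin.read(32)
    if len(digest) != 32:
        raise IndexError("chunk_index is past the end of chunk_hashes.bin")
    return digest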
A) Spot-check a record (recommended)
You need four things: (1) the published commitment.json, (2) the record index you want to check,
(3) the chunk that contains that record, and (4) the Merkle proof for that chunk.
The script below downloads only one chunk (8 MiB) plus a tiny proof (~1 KiB).
Python ≥3.10
# verify_record.py
# Usage: python verify_record.py --data-root <URL-or-path> --record-index 123456789
# Example data layout at <DATA_ROOT>:
#   manifest.json, commitment.json, chunk_hashes.bin, merkle_tree.bin, shards/, proofs/
import argparse, json, os, sys
import hashlib
from pathlib import Path

try:
    import blake3
    def h(data: bytes) -> bytes: return blake3.blake3(data).digest()
except Exception:
    def h(data: bytes) -> bytes: return hashlib.blake2b(data, digest_size=32).digest()

def fetch(path_or_url: str) -> bytes:
    # Works for local paths; for HTTP(S), replace with requests.get(...).content
    p = Path(path_or_url)
    return p.read_bytes()

def hash_cat(a: bytes, b: bytes) -> bytes: return h(a + b)

def verify_merkle_path(leaf_hash: bytes, leaf_index: int, siblings_hex: list[str]) -> bytes:
    # Fold the leaf hash up the tree: the sibling goes on the right when our
    # index at that level is even, on the left when it is odd.
    hval = leaf_hash
    idx = leaf_index
    for sib_hex in siblings_hex:
        sib = bytes.fromhex(sib_hex)
        hval = hash_cat(hval, sib) if idx % 2 == 0 else hash_cat(sib, hval)
        idx //= 2
    return hval  # computed root

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--data-root", required=True, help="Local dir or base URL where files are published")
    ap.add_argument("--record-index", type=int, required=True, help="0-based index of the uint64 record to verify")
    args = ap.parse_args()

    # Load manifest & commitment
    manifest = json.loads(fetch(os.path.join(args.data_root, "manifest.json")).decode())
    commit = json.loads(fetch(os.path.join(args.data_root, "commitment.json")).decode())
    chunk_bytes = int(manifest["chunk_bytes"])
    rec_bytes = 8

    # Compute byte offset and chunk index
    byte_off = args.record_index * rec_bytes
    chunk_index = byte_off // chunk_bytes
    offset_in_chunk = byte_off % chunk_bytes

    # Load the chunk bytes (exactly 8 MiB except possibly the last chunk).
    # For a local dir you could slice directly from the shards; for simplicity, use a pre-sliced chunk file:
    chunk_path = os.path.join(args.data_root, f"chunks/chunk_{chunk_index:012d}.bin")
    chunk = fetch(chunk_path)

    # Hash the chunk and fetch its proof
    leaf_hash = h(chunk)
    proof = json.loads(fetch(os.path.join(args.data_root, f"proofs/chunk_{chunk_index:012d}.json")).decode())
    if proof["leaf_index"] != chunk_index:
        print("Proof index mismatch"); sys.exit(2)

    # Rebuild the Merkle root from the proof and compare
    root = verify_merkle_path(leaf_hash, chunk_index, proof["siblings"])
    if root.hex() != proof["root"] or root.hex() != commit["root_hex"]:
        print("Merkle root mismatch — proof or data is invalid"); sys.exit(3)

    # Within the chunk, read the 8-byte record and show it
    if offset_in_chunk + rec_bytes > len(chunk):
        print("Record offset out of chunk bounds"); sys.exit(4)
    rec = chunk[offset_in_chunk: offset_in_chunk + rec_bytes]
    val = int.from_bytes(rec, byteorder="little")  # dataset writes uint64 little-endian
    print("OK: record is present.")
    print("record_index:", args.record_index)
    print("uint64:", val)
    print("bitstring (MSB->LSB, 63 bits):", format(val, "064b")[1:])  # top bit unused

if __name__ == "__main__":
    main()
Pro tip: You can host chunks/ and proofs/ alongside the main files so users never have to touch the full shards.
If you don’t publish per-chunk files, the same approach still works on local data: slice the chunk directly out of the shards, as in the sketch below.
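The slicing itself is straightforward. A minimal sketch, assuming the shards concatenated in manifest order form one logical byte stream whose chunk_bytes-sized slices are the Merkle leaves, and that each manifest entry carries "path" and "size" keys (adjust the key names to the real manifest):

import json
from pathlib import Path

def read_chunk_from_shards(data_dir: Path, chunk_index: int) -> bytes:
    """Assemble one leaf-sized chunk by seeking into the shard files (a sketch)."""
    m = json.loads((data_dir / "manifest.json").read_text())
    chunk_bytes = int(m["chunk_bytes"])
    start = chunk_index * chunk_bytes          # global byte offset of the chunk
    remaining = chunk_bytes
    out = bytearray()
    for sh in m["shards"]:
        size = int(sh["size"])                 # assumed manifest key
        if start >= size:                      # chunk begins in a later shard
            start -= size
            continue
        with open(data_dir / sh["path"], "rb") as fin:
            fin.seek(start)
            piece = fin.read(min(remaining, size - start))
        out += piece
        remaining -= len(piece)
        start = 0
        if remaining == 0:
            break
    return bytes(out)                          # may be shorter for the final chunk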
B) Full re-hash of the dataset (slow)
This recomputes every 8 MiB leaf hash from the raw shards, rebuilds the Merkle tree, and compares the root to
commitment.json. It’s the gold standard, but it requires all bytes to be local.
Python ≥3.10
# verify_all.py
import json, hashlib
from pathlib import Path

try:
    import blake3
    def h(data: bytes) -> bytes: return blake3.blake3(data).digest()
except Exception:
    def h(data: bytes) -> bytes: return hashlib.blake2b(data, digest_size=32).digest()

def hash_cat(a: bytes, b: bytes) -> bytes: return h(a + b)

def rebuild_root(data_dir: Path) -> str:
    m = json.loads((data_dir / "manifest.json").read_text())
    chunk_bytes = int(m["chunk_bytes"])

    # Stream leaves: hash each chunk_bytes-sized block of every shard
    leaf_hashes: list[bytes] = []
    for sh in m["shards"]:
        p = data_dir / sh["path"]
        size = p.stat().st_size
        with open(p, "rb") as fin:
            remaining = size
            while remaining > 0:
                to_read = min(chunk_bytes, remaining)
                block = fin.read(to_read)
                if len(block) != to_read:
                    raise IOError("short read on shard")
                leaf_hashes.append(h(block))
                remaining -= to_read

    # Build the Merkle tree bottom-up; an odd trailing node is paired with itself
    level = leaf_hashes
    while len(level) > 1:
        nxt = []
        it = iter(level)
        for a in it:
            b = next(it, None)
            if b is None:
                b = a
            nxt.append(hash_cat(a, b))
        level = nxt
    return level[0].hex()

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("data_dir", help="directory containing shards + manifest.json + commitment.json")
    args = ap.parse_args()
    data_dir = Path(args.data_dir)
    root_rebuilt = rebuild_root(data_dir)
    root_claimed = json.loads((data_dir / "commitment.json").read_text())["root_hex"]
    print("rebuilt_root:", root_rebuilt)
    print("claimed_root:", root_claimed)
    print("MATCH" if root_rebuilt == root_claimed else "MISMATCH")
Finding a specific bracket
Each bracket is a single uint64. The top bit is unused; the next 63 bits encode the winners (MSB first).
If you know a record’s index N, its byte offset is offset = N × 8. With 8 MiB leaves (8,388,608 bytes each):
chunk_index = offset // 8388608 and offset_in_chunk = offset % 8388608.
To decode bits into rounds/regions in Python, you can reuse the helper in the generator or treat the 63-bit string directly.
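A minimal decoding sketch, assuming the 63 bits are the 63 games of a standard 64-team single-elimination bracket grouped round by round (32, 16, 8, 4, 2, 1 games, MSB first); the generator’s helper remains the authoritative decoder, and the per-game ordering within each round is whatever the generator defines:

def locate(record_index: int, chunk_bytes: int = 8 * 1024 * 1024) -> tuple[int, int]:
    """Map a 0-based record index to (chunk_index, offset_in_chunk)."""
    byte_off = record_index * 8
    return byte_off // chunk_bytes, byte_off % chunk_bytes

def split_rounds(val: int) -> list[str]:
    """Split a uint64 record into per-round bitstrings, first round first."""
    bits = format(val, "064b")[1:]          # drop the unused top bit
    rounds, pos = [], 0
    for games in (32, 16, 8, 4, 2, 1):      # assumed round sizes for 63 games
        rounds.append(bits[pos:pos + games])
        pos += games
    return rounds

print(locate(123_456_789))                  # -> (117, 6187176) with 8 MiB chunks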
How I’ll publish proofs
For convenience, I’ll expose proofs/chunk_{index:012d}.json and chunks/chunk_{index:012d}.bin.
Each proof JSON has:
{
  "leaf_index": 123456,
  "leaf_hash": "…",        // optional
  "siblings": ["hex", …],  // bottom → top
  "root": "ROOT_HEX"
}
If you’re running locally, you can also generate a proof with the included MerkleBuilder.prove_chunk().
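For completeness, here is a standalone sketch of proof generation that does not depend on MerkleBuilder. It assumes the contiguous 32-byte leaf layout of chunk_hashes.bin and the same odd-node-paired-with-itself rule used in verify_all.py, so its output verifies with verify_merkle_path above:

import hashlib
from pathlib import Path

try:
    import blake3
    def h(data: bytes) -> bytes: return blake3.blake3(data).digest()
except Exception:
    def h(data: bytes) -> bytes: return hashlib.blake2b(data, digest_size=32).digest()

def hash_cat(a: bytes, b: bytes) -> bytes: return h(a + b)

def prove_chunk(data_dir: Path, leaf_index: int) -> dict:
    """Build an inclusion proof for one chunk from chunk_hashes.bin (a sketch)."""
    raw = (data_dir / "chunk_hashes.bin").read_bytes()
    level = [raw[i:i + 32] for i in range(0, len(raw), 32)]   # leaf hashes
    leaf_hex = level[leaf_index].hex()
    siblings, idx = [], leaf_index
    while len(level) > 1:
        sib_idx = idx ^ 1                                     # partner at this level
        siblings.append((level[sib_idx] if sib_idx < len(level) else level[idx]).hex())
        # collapse to the next level exactly as verify_all.py does
        nxt = []
        for i in range(0, len(level), 2):
            a = level[i]
            b = level[i + 1] if i + 1 < len(level) else a
            nxt.append(hash_cat(a, b))
        level, idx = nxt, idx // 2
    return {"leaf_index": leaf_index, "leaf_hash": leaf_hex,
            "siblings": siblings, "root": level[0].hex()}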