ANE/training/download_data.sh

#!/bin/bash
# Download pretokenized TinyStories data for ANE training
# Format: flat uint16 token IDs (Llama2 BPE, 32K vocab)
# Source: enio/TinyStories on HuggingFace (pretokenized with karpathy/llama2.c)
#
# The tar.gz contains data00.bin..data49.bin (50 shards).
# We extract only data00.bin and rename it to tinystories_data00.bin.

# Abort on the first command that fails.
set -e

# Absolute directory containing this script; the data file lives next to it.
# CDPATH is cleared so `cd` cannot print an extra path into the substitution,
# and `--` protects against a script path that starts with a dash.
SCRIPT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
OUTPUT="$SCRIPT_DIR/tinystories_data00.bin"

# If the shard is already present, report its size and stop.
if [ -f "$OUTPUT" ]; then
    # Try the BSD/macOS stat syntax first, then fall back to GNU stat.
    SIZE=$(stat -f%z "$OUTPUT" 2>/dev/null || stat -c%s "$OUTPUT" 2>/dev/null)
    # The file is a flat array of uint16 token IDs: 2 bytes per token.
    TOKENS=$((SIZE / 2))
    # Size in MB with one (truncated) decimal, computed with shell integer
    # arithmetic so the message does not depend on `bc` being installed.
    SIZE_MB="$((SIZE / 1000000)).$((SIZE % 1000000 / 100000))"
    echo "$OUTPUT already exists ($TOKENS tokens, $SIZE_MB MB)"
    exit 0
fi

TAR_URL="https://huggingface.co/datasets/enio/TinyStories/resolve/main/tok32000/TinyStories_tok32000.tar.gz?download=true"
TAR_FILE="$SCRIPT_DIR/TinyStories_tok32000.tar.gz"
# The transfer is written here first and renamed to TAR_FILE only once it has
# completed, so an interrupted download is never mistaken for a finished
# archive by the "already downloaded" check on a later run.
TAR_PART="$TAR_FILE.part"

echo "=== TinyStories Data Download ==="
echo "Downloading pretokenized TinyStories (32K vocab, ~993 MB)..."
echo "  Source: enio/TinyStories on HuggingFace"
echo "  This will take a few minutes depending on your connection."
echo ""

# Download the tar.gz (skipped when a complete archive is already present)
if [ ! -f "$TAR_FILE" ]; then
    # Discard any leftover from a previously interrupted attempt.
    rm -f "$TAR_PART"
    DL_STATUS=0
    if command -v curl >/dev/null 2>&1; then
        # -f makes curl fail on HTTP errors instead of saving the error page.
        curl -fL --progress-bar -o "$TAR_PART" "$TAR_URL" || DL_STATUS=$?
    elif command -v wget >/dev/null 2>&1; then
        wget --show-progress -O "$TAR_PART" "$TAR_URL" || DL_STATUS=$?
    else
        echo "Error: need curl or wget" >&2
        exit 1
    fi
    if [ "$DL_STATUS" -ne 0 ]; then
        rm -f "$TAR_PART"
        echo "Error: download failed (exit code $DL_STATUS)." >&2
        exit 1
    fi
    mv "$TAR_PART" "$TAR_FILE"
else
    echo "Tar file already downloaded, skipping..."
fi

# Verify it's actually a gzip file (not an error page) by checking the
# two-byte gzip magic number 1f 8b. This avoids relying on the `file`
# utility, whose absence would otherwise make a valid archive look invalid
# and get it deleted.
MAGIC=$(head -c 2 "$TAR_FILE" | od -An -tx1 | tr -d '[:space:]')
if [ "$MAGIC" != "1f8b" ]; then
    echo "Error: Downloaded file is not a valid gzip archive." >&2
    echo "Content: $(head -c 100 "$TAR_FILE")" >&2
    rm -f "$TAR_FILE"
    exit 1
fi

echo ""
echo "Extracting data00.bin from archive..."

# List the archive to find the member path of shard 0. The pattern is anchored
# to a whole path component so look-alike entries (for example "._data00.bin"
# or "data00.bin.idx") are never selected in place of the real shard.
DATA_FILE=$(tar tzf "$TAR_FILE" 2>/dev/null | grep -E '(^|/)data00\.bin$' | head -n 1)
if [ -z "$DATA_FILE" ]; then
    echo "Error: data00.bin not found in archive. Contents:" >&2
    tar tzf "$TAR_FILE" | head -20 >&2
    exit 1
fi
echo "  Found: $DATA_FILE"

# Extract just data00.bin
tar xzf "$TAR_FILE" -C "$SCRIPT_DIR" "$DATA_FILE"

# Move to expected location (might be in a subdirectory)
EXTRACTED="$SCRIPT_DIR/$DATA_FILE"
if [ "$EXTRACTED" != "$OUTPUT" ]; then
    mv "$EXTRACTED" "$OUTPUT"
    # Remove the now-empty directories the extraction created, walking upward
    # from the member's directory. The walk stops at SCRIPT_DIR, at the first
    # directory that is not empty, or as soon as the path is no longer inside
    # SCRIPT_DIR, so nothing outside the script directory is ever touched.
    LEFTOVER_DIR=$(dirname "$EXTRACTED")
    while [ "$LEFTOVER_DIR" != "$SCRIPT_DIR" ] &&
        [ "${LEFTOVER_DIR#"$SCRIPT_DIR"/}" != "$LEFTOVER_DIR" ]; do
        rmdir "$LEFTOVER_DIR" 2>/dev/null || break
        LEFTOVER_DIR=$(dirname "$LEFTOVER_DIR")
    done
fi

# Clean up tar.gz to save disk space
echo "Cleaning up archive..."
rm -f "$TAR_FILE"

# Report the size of the extracted shard.
# Try the BSD/macOS stat syntax first, then fall back to GNU stat.
SIZE=$(stat -f%z "$OUTPUT" 2>/dev/null || stat -c%s "$OUTPUT" 2>/dev/null)
# The file is a flat array of uint16 token IDs: 2 bytes per token.
TOKENS=$((SIZE / 2))
# Size in MB with one (truncated) decimal, computed with shell integer
# arithmetic so the summary does not depend on `bc` being installed.
SIZE_MB="$((SIZE / 1000000)).$((SIZE % 1000000 / 100000))"
echo ""
echo "Done: $OUTPUT"
echo "  $TOKENS tokens ($SIZE_MB MB)"

# Sanity check (best effort): decode and print the first 10 token IDs as
# little-endian uint16 values. The path is passed to Python as an argument
# instead of being pasted into the source text, so a directory name containing
# quotes or backslashes cannot break the snippet. Any failure (python3 not
# available, unreadable or too-short file) is deliberately ignored because
# this step is purely informational.
python3 -c '
import struct
import sys
with open(sys.argv[1], "rb") as f:
    tokens = struct.unpack("<10H", f.read(20))
    print(f"First 10 tokens: {tokens}")
' "$OUTPUT" 2>/dev/null || true