mirror of https://github.com/maderix/ANE.git
92 lines
2.8 KiB
Bash
Executable File
92 lines
2.8 KiB
Bash
Executable File
#!/bin/bash
#
# Download pretokenized TinyStories data for ANE training.
#
# Format: flat uint16 token IDs (Llama2 BPE, 32K vocab)
# Source: enio/TinyStories on HuggingFace (pretokenized with karpathy/llama2.c)
#
# The tar.gz contains data00.bin..data49.bin (50 shards).
# We extract only data00.bin and rename it to tinystories_data00.bin, which is
# placed in the same directory as this script.
#
# Usage:    run with no arguments.
# Requires: curl or wget, tar. python3 is optional (used for a sanity print).
# Exit:     0 if the data file already exists or was produced, non-zero on error.

# pipefail is deliberately NOT enabled: the archive listing below is piped into
# `head`, which closes the pipe early and makes tar terminate via SIGPIPE.
set -eu

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OUTPUT="$SCRIPT_DIR/tinystories_data00.bin"

TAR_URL="https://huggingface.co/datasets/enio/TinyStories/resolve/main/tok32000/TinyStories_tok32000.tar.gz?download=true"
TAR_FILE="$SCRIPT_DIR/TinyStories_tok32000.tar.gz"
# The download is written here first and only renamed to TAR_FILE on success,
# so an interrupted transfer is never mistaken for a complete archive.
PART_FILE="$TAR_FILE.part"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the size of file $1 in bytes.
file_bytes() {
  local n
  n=$(wc -c <"$1")
  # Arithmetic expansion strips the leading padding some wc builds emit.
  printf '%s\n' "$((n))"
}

# Print byte count $1 as decimal megabytes with one fractional digit.
bytes_to_mb() {
  awk -v b="$1" 'BEGIN { printf "%.1f", b / 1000000 }'
}

# Nothing to do if the shard has already been extracted.
if [ -f "$OUTPUT" ]; then
  SIZE=$(file_bytes "$OUTPUT")
  TOKENS=$((SIZE / 2)) # each token is one uint16, i.e. 2 bytes
  echo "$OUTPUT already exists ($TOKENS tokens, $(bytes_to_mb "$SIZE") MB)"
  exit 0
fi

echo "=== TinyStories Data Download ==="
echo "Downloading pretokenized TinyStories (32K vocab, ~993 MB)..."
echo "  Source: enio/TinyStories on HuggingFace"
echo "  This will take a few minutes depending on your connection."
echo ""

# Download the tar.gz (skipped when a complete archive is already present).
if [ ! -f "$TAR_FILE" ]; then
  # Remove the partial file on any exit path (failure, Ctrl-C, ...).
  trap 'rm -f -- "$PART_FILE"' EXIT
  if command -v curl >/dev/null 2>&1; then
    # --fail: treat HTTP errors as failures instead of saving the error page.
    curl --fail -L --progress-bar -o "$PART_FILE" "$TAR_URL" ||
      die "Error: download failed"
  elif command -v wget >/dev/null 2>&1; then
    wget --show-progress -O "$PART_FILE" "$TAR_URL" ||
      die "Error: download failed"
  else
    die "Error: need curl or wget"
  fi
  mv -- "$PART_FILE" "$TAR_FILE"
else
  echo "Tar file already downloaded, skipping..."
fi

# Verify it's actually a gzip file (not an error page) by checking the two
# gzip magic bytes. This avoids depending on file(1), whose output also
# contains the file path and could therefore match "gzip" by accident.
MAGIC=$(head -c 2 "$TAR_FILE" | od -An -tx1 | tr -d '[:space:]')
if [ "$MAGIC" != "1f8b" ]; then
  echo "Error: Downloaded file is not a valid gzip archive." >&2
  echo "Content: $(head -c 100 "$TAR_FILE")" >&2
  rm -f -- "$TAR_FILE"
  exit 1
fi

echo ""
echo "Extracting data00.bin from archive..."

# List what's in the archive to find the right path. The pattern is anchored
# so that only a member whose file name is exactly data00.bin is selected.
DATA_FILE=$(tar tzf "$TAR_FILE" 2>/dev/null | grep -E '(^|/)data00\.bin$' | head -1)
if [ -z "$DATA_FILE" ]; then
  echo "Error: data00.bin not found in archive. Contents:" >&2
  tar tzf "$TAR_FILE" | head -20 >&2
  # The archive is kept for inspection, so tell the user how to recover.
  echo "If the archive is corrupt, delete $TAR_FILE and re-run this script." >&2
  exit 1
fi
echo "  Found: $DATA_FILE"

# Extract just data00.bin
tar xzf "$TAR_FILE" -C "$SCRIPT_DIR" "$DATA_FILE"

# Move to expected location (might be in a subdirectory)
EXTRACTED="$SCRIPT_DIR/$DATA_FILE"
if [ "$EXTRACTED" != "$OUTPUT" ]; then
  mv -- "$EXTRACTED" "$OUTPUT"
  # Clean up any extracted subdirectory. Best effort: rmdir refuses to remove
  # a non-empty directory (e.g. SCRIPT_DIR itself), which is what we want.
  rmdir "$(dirname "$EXTRACTED")" 2>/dev/null || true
fi

# Clean up tar.gz to save disk space
echo "Cleaning up archive..."
rm -f -- "$TAR_FILE"

SIZE=$(file_bytes "$OUTPUT")
TOKENS=$((SIZE / 2)) # each token is one uint16, i.e. 2 bytes
echo ""
echo "Done: $OUTPUT"
echo "  $TOKENS tokens ($(bytes_to_mb "$SIZE") MB)"

# Sanity check: print the first 10 little-endian uint16 tokens. Best effort
# only -- silently skipped when python3 is missing or the file is too short.
# The path is passed as an argument rather than spliced into the Python
# source, so quotes in the path cannot break the snippet.
python3 - "$OUTPUT" 2>/dev/null <<'PY' || true
import struct
import sys

with open(sys.argv[1], 'rb') as f:
    tokens = struct.unpack('<10H', f.read(20))
print(f'First 10 tokens: {tokens}')
PY