fake-firehose/scripts/stream-url.sh

82 lines
2.0 KiB
Bash

# Arguments:
#   $1 - URL of the stream to fetch (required)
#   $2 - host name the stream belongs to; used for the DNS check, log messages
#        and archive file names
# Exit status: 2 when the URL is empty or the host does not resolve.
url=$1
host=$2
errors=0 # Counts passes through the streaming loop; drives the retry delay

# Nothing can be streamed without a URL.
if [[ -z "$url" ]]; then
  echo "[WARN] Empty url, skipping"
  exit 2
fi

# Check that the domain name resolves; if it does not, exit instead of
# starting a stream that can never connect. The lookup is only made when a
# host was actually passed, so callers that supply just a URL are unaffected.
if [[ -n "$host" && -z "$(dig "$host" +short)" ]]; then
  echo "[WARN] DNS Lookup failed for $host, skipping"
  exit 2
fi

# $archive is not assigned in this script (presumably exported by the caller).
echo "[INFO] Archive is $archive"
# Main loop: fetch $url over and over, pausing between attempts.
# Reads $url, $host and $errors set earlier in the script, plus $archive, which
# is not assigned in this script (presumably exported by the caller).
while true; do
  # Recomputed on every pass so output moves to a new file when the date changes.
  today=$(date +"%Y%m%d")
  echo "[INFO] Starting to stream $url in 5 seconds"
  echo "[INFO] Archive status is $archive"
  sleep 5s

  # In archive mode we only fetch the json stream, to save resources from jq and sed.
  if [[ $archive != "true" ]]; then
    # Not in archive mode:
    #   tee  - appends the raw stream to the daily json file
    #   grep - keeps only lines that mention "url"
    #   sed  - removes every "data:" occurrence (presumably the event-stream prefix)
    # The loop then records the uri of every line that carries one.
    curl -X "GET" "$url" --no-progress-meter \
      | tee -a "/data/$today.json" \
      | grep url \
      | sed 's/data://g' \
      | while read -r line; do
        if [[ $line == *"uri"* ]]; then
          # Feed the line to jq quoted: an unquoted expansion would glob-expand
          # characters such as * or ? inside the JSON and corrupt it.
          # jq -r prints the strings without surrounding quotes.
          # Distinct names keep the stream's own $url from being shadowed.
          post_url=$(jq -r .url <<<"$line")
          post_uri=$(jq -r .uri <<<"$line")
          echo "[INFO] Posting $post_url from $host"
          echo "$post_uri" >>"/data/$today.uris.txt"
        fi
      done
  else
    # Archive mode: append the raw stream to a per-day, per-host file.
    # mkdir -p is a no-op when the directory already exists.
    mkdir -p "/data/$today/"
    curl -X "GET" "$url" --no-progress-meter >>"/data/$today/$today.$host.json"
  fi

  # Basic quadratic backoff: 1, 4, 9, ... seconds. The counter is never reset,
  # so the pause grows on every pass through the loop until it reaches the cap.
  ((++errors))
  sleepseconds=$((errors * errors))

  # Don't allow a back off for more than 5 minutes.
  # Because we expect this container to reset occasionally to kill hanging curl
  # processes, a graceful exit will wait for all scripts to stop. So, it will
  # take at least as long as $sleepseconds to stop.
  if ((sleepseconds > 300)); then
    sleepseconds=300
  fi

  # Announce the pause before it starts, so the log line matches what happens next.
  echo "[WARN] Streaming abrubtly stopped for $host, streaming will pause for $sleepseconds seconds before retrying."
  sleep "$sleepseconds"
done

## Exit 0 by default (not reached while the loop above runs forever)
exit 0