diff --git a/.gitignore b/.gitignore index 144e950..65f4611 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,4 @@ data/20221219.json config/hashtag-urls.txt config/urls.txt data/20221219.uris.txt -./data/* \ No newline at end of file +data/* \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f0a641c..fa6c194 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ version: "3.9 " services: - fake-firehose: + firehose-archive: build: . image: fakefirehose:latest volumes: diff --git a/scripts/start-firehose.sh b/scripts/start-firehose.sh index 2125aab..50e76a9 100644 --- a/scripts/start-firehose.sh +++ b/scripts/start-firehose.sh @@ -82,7 +82,7 @@ cat /config/hashtag-urls.txt >> /config/urls.txt cat /config/urls.txt | while read -r url do echo "Opening $url to stream" - sleep 1s + sleep 0.1s ./stream-url.sh $url & done diff --git a/scripts/stream-url.sh b/scripts/stream-url.sh index ba34c5a..68f0ada 100644 --- a/scripts/stream-url.sh +++ b/scripts/stream-url.sh @@ -12,26 +12,30 @@ do today=`date +"%Y%m%d"` echo "Starting to stream $url in 5 seconds" + echo "Archive status is $archive" sleep 5s; - curl -X "GET" "$url" \ - --no-progress-meter | \ - tee -a "/data/$today.json" | \ - grep url | \ - sed 's/data://g' | \ + if [[ $archive != "true" ]] + then + curl -X "GET" "$url" \ + --no-progress-meter | \ + tee -a "/data/$today.json" | \ + grep url | \ + sed 's/data://g' | \ - while read -r line - do + while read -r line + do + if [[ $line == *"uri"* ]] + then + url=`echo $line | jq .url| sed 's/\"//g'` + uri=`echo $line | jq .uri| sed 's/\"//g'` - if [[ $line == *"uri"* ]] - then - url=`echo $line | jq .url| sed 's/\"//g'` - uri=`echo $line | jq .uri| sed 's/\"//g'` - - echo "STREAMING from $host $url" - echo $uri >> "/data/$today.uris.txt" - - fi - done + echo "STREAMING from $host $url" + echo $uri >> "/data/$today.uris.txt" + fi + done + else + curl -X "GET" "$url" --no-progress-meter >> "/data/$today.$host.json" + fi done \ No newline at end of file