Compare commits

...

29 Commits

Author SHA1 Message Date
raynormast a4614e886b
Merge pull request #5 from bolstad/deduplicte-endpoint-urls
Deduplicte endpoint urls
2023-03-23 21:13:53 -04:00
Christian Bolstad 237840aa33 Do not process hostnames that starts with a single # (in case someone accidently used the incorrect format to comment out lines 2023-01-14 13:52:53 +01:00
Christian Bolstad 237ed1303d Make sure that we only process the same url once 2023-01-14 13:51:31 +01:00
Christian Bolstad e66b06503d always reset hashtag-urls.txt 2023-01-14 13:50:32 +01:00
raynor d466f59781 Iniitial commit of misc scripts; needs debugging 2023-01-02 09:23:00 -05:00
Raynor dcc2a551d2 Added sudos 2022-12-28 01:28:48 +00:00
Raynor 1ccee8638d Fixed typo 2022-12-28 01:27:21 +00:00
Raynor c33b08165f Cleaned gitignore 2022-12-27 23:34:46 +00:00
Raynor 2f35a3cad6 Updated todo list 2022-12-27 23:34:10 +00:00
Raynor 022ac7750f Cleaned up gitignore 2022-12-27 23:34:02 +00:00
Raynor e4f1ac5688 Updated documentation 2022-12-27 23:27:50 +00:00
Raynor 046839eba5 Cleaned up some unneeded files. 2022-12-27 22:20:53 +00:00
Raynor 2cc0d91ca4 Fixed bug in how hashtags are found 2022-12-26 17:50:05 +00:00
Raynor 1f9d8adc14 Clarified logging output language 2022-12-26 17:42:01 +00:00
Raynor 4c316296a5 Better documentation on minURIs 2022-12-26 17:31:53 +00:00
Raynor 29eb750165 Allow for variable stream delay to debug 2022-12-26 17:25:37 +00:00
Raynor 64c2abf173 Updated logging output 2022-12-26 17:20:07 +00:00
Raynor 1d7a2faf0b Restart time is not configurable 2022-12-22 14:10:36 +00:00
Raynor d2d7f4f621 Install some other helpful internet utils 2022-12-22 14:07:31 +00:00
Raynor 86d97397df Upped restart time, output now more helpful 2022-12-22 14:07:09 +00:00
Raynor 9d3f7f4873 Added archive mode and some basic health checks 2022-12-22 14:06:43 +00:00
Raynor 543f896d52 Clarified debug output 2022-12-22 14:06:19 +00:00
Raynor bd71dc16b4 Changed restart time; added to todo 2022-12-21 03:37:10 +00:00
Raynor 3a5a2c34e3 Change how archive directory is structured; create folder if needed 2022-12-21 03:36:31 +00:00
Raynor d3accb4d65 Added a basic backoff if in archive mode 2022-12-21 03:29:28 +00:00
Raynor 2acfdf264f Add archive mode where JSON is saved only 2022-12-21 03:21:55 +00:00
Raynor f561bb4521 git 2022-12-21 03:20:48 +00:00
raynormast e877becdb1
Merge pull request #3 from raynormast/dev
Fixed URL
2022-12-18 19:56:56 -05:00
raynor eefa547a13 Fixed URL 2022-12-18 19:56:13 -05:00
15 changed files with 570 additions and 137 deletions

View File

@ -1,27 +0,0 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
{
"name": "Ubuntu",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/base:focal",
"features": {
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/eitsupi/devcontainer-features/jq-likes:1": {}
}
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "uname -a",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}

View File

@ -1,12 +1,34 @@
fakeRelayKey="YOUR--FAKE---RELAY---KEY"
fakeRelayHost="https://your-fake-relay-url.YourPetMastodon.com"
## Do you want to send URIs to fake relay or just log them?
## Set to false if you don't want to send URIs to your fakerelay. Generally this is only used for debugging
runFirehose=true
# Maximum number of curls processes to run at once
maxCurls=50
## Maximum number of curl instances to be allowed to run. This is only used
## if you send data to the relay
maxCurls=500
# Minimum number of URIs to have before you process a batch.
# Don't put this too low or you send over lots of duplicates and burn up your machine
minURIs=100
## Minimum number of posts to queue up before sending them to the relay.
## This is more useful when you are streaming federated timelines from larger instances
## because you will have a lot of duplicate URIs, wasting resources on your machine.
## A URI batch is de-duplicated before being sent to the relay.
##
## On the other hand, if you are not following timelines that generate a lot of posts
## a smaller value is appropriate.
minURIs=100
## Archive mode will save the json stream but not parse it, not even into URIs
## This greatly reduces resource use, but obviously nothing will be sent to
## the relay.
##
## Generally only used for debugging or archiving instance streams
archive=false
## Restart timeout
## Put whatever amount of time here you want the container to restart after
## This will kill any hanging curl processes that may be taking up unnecessary
## resources
restartTimeout=1h
## How long to delay starting a stream. Leave it here unless you are debugging
streamDelay="0.1s"

8
.gitignore vendored
View File

@ -1,14 +1,10 @@
.DS_Store
data/20221217.json
20221217.uris.txt
.env.production
config/domains-federated
config/domains-local
config/hashtags
config/hosts
.gitignore
data/20221219.json
.gitignore
config/hashtag-urls.txt
config/urls.txt
data/20221219.uris.txt
log.txt

View File

@ -5,7 +5,9 @@ RUN apt install -y \
jq \
curl \
nano \
vim
vim \
dnsutils \
wget
COPY ./scripts /scripts
WORKDIR /scripts

View File

@ -1,3 +1,5 @@
1. Create language filter
2. Create bot filter
3.
3. Add health check
4. Use smaller docker base, probably alpine
5. Create and set non-root user in dockerfile.

11
docker-compose.sample.yml Normal file
View File

@ -0,0 +1,11 @@
version: "3.9 "
services:
firehose-archive:
build: .
image: fakefirehose:latest
volumes:
- ../data:/data
- ./config:/config
restart: always
env_file: .env.production

View File

@ -1,11 +1,11 @@
version: "3.9 "
services:
fake-firehose:
firehose-archive:
build: .
image: fakefirehose:latest
volumes:
- ./data:/data
- ../data:/data
- ./config:/config
restart: always
env_file: .env.production

272
readme.md
View File

@ -1,10 +1,9 @@
# Fake Firehose
This project generates the mythical "firehose" relay that small Mastodon instances look for,
at least to get content.
This project is basically a shell/bash/text frontend for [fakerelay](https://github.com/g3rv4/FakeRelay)
It's a little crazy.
It allows instances to fill their federated timelines from other instances that have public timelines.
Find a better way to do it and issue a pull request, or just tell me where your new repo is :)
You can find the fakefirehose author at [@raynor@raynor.haus](https://raynor.haus/@raynor)
## How to run it
@ -14,9 +13,9 @@ In the config folder there are three files
- domains-local
- hashtags
If you want the full on public feed from an instance, put it in the domains-federated file, one domain per line
If you want the full on public feed from an instance, put it in the domains-federated file, one domain per line.
If you only want the local feed from an instance, put it on the domains-local file, one domain per line
If you only want the local feed from an instance, put it on the domains-local file, one domain per line.
If you want to follow a hashtag you either add a hashtag after an instance in `domains-federated` or `domains-local`
@ -26,14 +25,101 @@ stream from mastodon.social
Another example: if in `domains-local` you put `infosec.exchange #hacker` a stream will open to watch for the hashtag #hacker on the _local_ stream from infosec.exchange
## Docker
Build docker
To run it in docker -- recommended
Run docker
1. Make sure you have [docker installed](https://docs.docker.com/engine/install/).
2. From your shell, create a directory; it is recommended that you give it a relevant name
3. Go into that directory and use `git clone https://github.com/raynormast/fake-firehose.git`
4. Go into the created directory: `cd fake-firehose`
5. `sudo docker build -t fakefirehose .`
6. Edit your `docker-compose.yml` file as needed. **The biggest thing** is to watch the volumes. It is _highly_ recommended that you keep your data directory in the parent directory, and NOT the directory the git repo is in.
7. Edit your `.env.production` file. The file is fairly well commented.
8. Run `sudo docker compose -f docker-compose.yml up -d`
### The hashtags file
If you put ANY hashtags in here a stream will be opened for _every_ host in the `domains-federated` and `domains-local` file.
The entire thing should look something like:
```
cd ~
mkdir MastodonFirehose
cd MastodonFirehose
git clone https://github.com/raynormast/fake-firehose.git
cd fake-firehose
docker build -t fakefirehose .
# Edit your docker-compose and .env.production here
sudo docker compose -f docker-compose.yml up -d
```
# Configuration
## tl;dr
Your `./config` folder has three sample files, after editing you should have the following three files:
```
domains-federated
domains-local
hashtags
```
**In each file, comments begin with `##`, not the traditional single `#`.**
The syntax is the same for the domains files:
```
## Follow full timeline
mastodon.instance
## Follow these hashtags from the timeline
mastodon.instance #mastodon #relay
```
The files are well commented.
## domains-federated file
This file has the full federated feeds of any instances you want fed to fakerelay.
Each line of the file should have the domain name of an instance whose federated timeline you want to follow.
E.g.,
```
raynor.haus
infosec.exchange
```
This can generate a LOT of posts if you choose a large instance.
For example, if you use `mastodon.social` or `mas.to` you can expect your server to fall behind. `mastodon.social` generates 50,000 - 200,000 posts on the federated timeline per day.
It is recommended that you only use this file to:
- follow hashtags
- follow instances with small federated timelines, with content you want in yours
#### domains-federated hashtags
The one time to use the federated timeline is to catch most posts with a specific hashtag.
Every word after an instance domain is a hashtag to relay.
Example:
`mastodon.social fediblock fediverse mastodev mastoadmin`
Will only return posts from the mastodon.social federated feed with hashtags of `#fediblock`, `#fediverse`,
`#mastodev`, and `#mastoadmin`.
The `#` is optional -- it is accepted simply to make the file more intuitive.
## domains-local file
This file is identical to the `domains-federated` file except that it only receives posts created on
_that_ instance (the local timeline).
It is possible to keep up with the larger instances, such as `mastodon.social` if you only look at the
local timeline.
## hashtags file
If you put ANY hashtags in here a stream will be opened for _every_ host in the `domains-federated` and `domains-local` file.
**Its purpose is for people or instances that want to find nearly every post with a particular hashtag.**
_It can very quickly open up a lot of `curl` streams_
### Example
`domains-federated` content:
```
@ -56,6 +142,10 @@ Mastodon
will result in the following streams all opening:
```shell
https://mastodon.social/api/v1/streaming/public
https://mas.to/api/v1/streaming/public
https://aus.social/api/v1/streaming/public/local
https://mastodon.nz/api/v1/streaming/public/local
https://mastodon.social/api/v1/streaming/hashtag?tag=JohnMastodon
https://mas.to/api/v1/streaming/hashtag?tag=JohnMastodon
https://aus.social/api/v1/streaming/hashtag?tag=JohnMastodon
@ -67,6 +157,164 @@ https://mastodon.nz/api/v1/streaming/hashtag?tag=Mastodon
```
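Each of these endpoints is just a plain HTTP stream, so you can inspect one by hand before committing it to your config. A minimal sketch, mirroring what `stream-url.sh` does (the instance URL here is only an example):
```shell
# Watch a public streaming endpoint and print the post URLs it emits.
# Requires curl and jq; press Ctrl+C to stop.
curl -X "GET" "https://mastodon.nz/api/v1/streaming/public/local" \
  --no-progress-meter | \
  grep url | \
  sed 's/data://g' | \
  jq .url
```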
If you had a total of 5 lines in `domains-federated` and `domains-local` plus 3 entries in `hashtags`
there would 5x5x3 = 75 new streams.
there would be 5 x 5 x 3 = 75 new streams.
I mean, you can do it, but you won't need your central heating system any more.
Usually a more targeted approach is better.
It is recommended that you put hashtags in your `domains-federated` or `domains-local` files.
Your humble author's federated file currently looks like this:
```
mastodon.social infosec hacker hackers osint hive lockbit hackgroup apt vicesociety
mastodon.social blackmastodon blackfediverse poc actuallyautistic neurodivergent blacklivesmatter freechina antiracist neurodiversity blackhistory bipoc aapi asian asianamerican pacificislander indigenous native
mastodon.social fediblock fediverse mastodev mastoadmin
mastodon.social apple politics vegan trailrunning church churchillfellowship christianity christiannationalism
```
My `domains-local` file is:
```
## Fake Firehose will only take local posts from these domains
mastodon.social
universeodon.com
## International English (if you aren't from the US) ###
## mastodon.scot
aus.social
mastodon.nz
respublicae.eu
mastodon.au
### Tech ###
partyon.xyz
infosec.exchange
ioc.exchange
tech.lgbt
techhub.social
fosstodon.org
appdot.net
social.linux.pizza
journa.host
climatejustice.social
```
This generates an acceptable stream of posts for my federated timeline. The tags I follow on mastodon.social
are those that are either few in number overall, or are harder to find on local timelines.
## .env.production
tl;dr: This file is fairly well commented internally; just go at it.
**The sample file probably does not need any changes beyond your fakerelay information**
### options
#### fakeRelayKey
This needs to have the key you generated with fakerelay.
_Example_:
`fakeRelayKey="MrNtYH+GjwDtJtR6YCx2O4dfasdf2349QtZaVni0rsbDryETCx9lHSZmzcOAv3Y8+4LiD8bFUZbnyl4w=="`
#### fakeRelayHost
The full URL to your fakerelay
_Example_:
`fakeRelayHost="https://fr-relay-post.myinstance.social/index"`
#### runFirehose
This controls whether the posts will actually be sent to your relay, or only collected in your /data folder.
You almost certainly want this set at:
`runFirehose=true`
The _only_ reason to set it to `false` is for debugging, or logging posts from the fediverse.
#### maxCurls and minURIs
These two options are closely related. `maxCurls` is the maximum number of `curl` processes you want to have
running on your system at once. If you follow timelines with a lot of posts, you may need to limit this.
**Note** This always needs to be higher than the total number of instances + hashtags you have configured, because each one of those is a separate `curl` process.
fake-firehose batches posts to de-duplicate them; `minURIs` is the size of that batch. If you have a lot of
_federated_ posts coming in you will want to set this to a high number because a lot of them will be duplicates.
If you only use local timelines it doesn't matter; you will not have any duplicates.
It is a tradeoff between resources (and `curl` processes running) and how quickly you want to fill your
instance's federated timeline.
_Example for a moderate number of incoming posts_:
```
## Max curl processes have not gotten out of control so this is absurdly high.
maxCurls=2000
## Nearly all of the timelines I follow are local, so there are very few duplicates.
minURIs=10
```
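To see the number the scripts actually compare against `maxCurls`, you can count running `curl` processes the same way they do. A quick sketch (the brackets just keep `grep` from matching its own process):
```shell
# Count curl processes currently running; the scripts use `ps -ef | grep curl | wc -l`.
ps -ef | grep "[c]url" | wc -l
```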
#### archive
Archive mode will save the json stream but not parse it, not even into URIs.
This greatly reduces resource use, but obviously nothing will be sent to
the relay.
**The only reasons to use this are debugging or logging posts from servers.**
You almost certainly want this set at:
```archive=false```
#### restartTimeout
This is how long the docker image will run before exiting. As long as your `docker-compose` has `restart: always` set, this simply restarts the image to kill any hung `curl` processes.
The only reason to set it high is if you have a lot of timelines you follow. Each one takes time to open up,
so if you restart often you will miss more posts.
_Example:_
`restartTimeout=4h`
#### streamDelay
This is only for debugging.
Keep it at:
`streamDelay="0.1s"`
# Data Directory
Data is saved in the format of:
```
"%Y%m%d".uris.txt
```
In archive mode the format is:
```
"/data/"%Y%m%d"/"%Y%m%d".$host.json"
```
For example, if you set `archive=true` and had `mastodon.social` in your `domains-federated` or `domains-local` config, on January 1st, 2023 the json stream would be saved at
```
/data/20230101/20230101.mastodon.social.json
```
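A quick way to check how much has been collected so far today, assuming the default (non-archive) layout described above:
```shell
# Count the URIs gathered today; the file name follows the %Y%m%d.uris.txt pattern.
wc -l "/data/$(date +%Y%m%d).uris.txt"
```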
# Misc
## Backoff
An exponential backoff starts if `curl` fails. It is rudimentary and maxes out at 15 minutes.
## DNS lookup
Before a URL starts streaming fakefirehose will look up the DNS entry of the host. If it fails,
the stream will not begin, _and will not attempt to begin again_ until the container is restarted.
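The check itself is just a `dig` lookup (`dnsutils` is installed in the Dockerfile for this). A minimal sketch of the same pre-flight test, assuming `$host` holds the domain:
```shell
# Skip this host if its DNS name does not resolve, matching the check in stream-url.sh.
if [[ ! $(dig "$host" +short) ]]; then
  echo "[WARN] DNS lookup failed for $host, skipping"
  exit 2
fi
```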
## Permissions
The output data files will be owned by `root` by default. This will get fixed
in a future release.
# Why fake firehose?
When I wrote this there were no other options I was aware of to fill the federated timeline of a small instance.
The work of [Gervasio Marchand](https://mastodonte.tech/@g3rv4) is fantastic but still required programming knowledge to make use of.
I wanted the simplest setup and config I could create, without setting up an entirely new web UI.
There are a lot of things to do better; I'll work on the ones I have time and capability for. Otherwise, this project
is practically begging to be re-written in python or something else.

View File

@ -1,46 +0,0 @@
host=$1
type=$2
hashtag=$1
if [[ "$host" == "" ]]
then
echo "Empty host: $host"
exit 2
fi
while true
do
today=`date +"%Y%m%d"`
case "$type" in
"federated")
fetch="https://$host/api/v1/streaming/public";;
"local")
fetch="https://$host/api/v1/streaming/public?local=true";;
esac
echo "Starting to stream $fetch in 5 seconds"
sleep 5s;
curl -X "GET" "$fetch" \
--no-progress-meter | \
tee -a "/data/$today.json" | \
grep url | \
sed 's/data://g' | \
while read -r line
do
if [[ $line == *"uri"* ]]
then
url=`echo $line | jq .url| sed 's/\"//g'`
uri=`echo $line | jq .uri| sed 's/\"//g'`
echo "STREAMING: $host $url"
echo $uri >> "/data/$today.uris.txt"
fi
done
done

View File

@ -0,0 +1,19 @@
############################################################################
##
## This script exports URIs from a saved JSON stream. It uses the same logic
## as stream-url.sh, except that it reads the JSON from a file.
## It takes one argument, the input file name.
##
############################################################################
source=$1
cat "$source"|grep -A 1 "event: update"|grep "data:" | \
while read -r line
do
if [[ $line == *"uri"* ]]
then
uri=`echo $line | sed 's/data: //g' | jq .uri| sed 's/\"//g'`
echo "$uri"
fi
done
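A usage sketch: the script name below is hypothetical (the compare view omits the file header), and the input path assumes the archive-mode layout described in the readme:
```shell
# Pull one URI per line out of an archived JSON stream (script name is hypothetical).
./export-uris.sh "/data/20230101/20230101.mastodon.social.json" > 20230101.uris.txt
```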

View File

@ -0,0 +1,83 @@
############################################################################
##
## This script sends URIs to fakerelay based on a saved JSON stream.
## It takes one argument, the input file name.
##
############################################################################
## Look for environmental variables. Because this script may be run outside of docker
## there is a good chance that they are not set; if they are not, attempt to set them
## via the .env.production file. If that fails, warn and keep going.
if [[ ! $loadEnv && -f ../../.env.production ]]
then
echo "[INFO] Did not detect that environmental variables are set, attempting to set via ../../.env.production"
source ../../.env.production
fi
if [[ ! $loadEnv ]]
then
echo "[WARN] Cannot find environemtnal variables, expect things to break ahead"
sleep 5s
fi
today=`date +"%Y%m%d"`
## The source file we are reading from
source=$1
while true
do
if [[ -f "./maxcurls" ]]
then
maxCurls=`cat ./maxcurls`
fi
## Here we take the top $minURIs lines of the file -- so we stay in FIFO order --
## and pipe them through uniq so we only pass unique URIs through to the fake relay.
## This step easily cuts the total number of URIs in half and is the only way we can keep up.
## Make sure that the same number is used in the following two lines; both now use $minURIs.
seed=`date +%Y%M%d%H%M%S%N`
backfillFile="backfilluris.$seed.txt"
sedExpression="1,${minURIs}d"
sed -i $sedExpression "$source"
head "$source" -n $minURIs | sort | uniq -u > "$backfillFile"
## Start looping through the unique URIs
cat "$backfillFile" | \
while read -r line
do
if [[ "$line" != "" ]]
then
uri=`echo $line | sed 's/data: //g' | jq .uri| sed 's/\"//g'`
echo "[INFO] RUN-FIREHOSE: Posting $uri"
## Send it to the fake relay as a background job
curl -X "POST" "$fakeRelayHost" \
-H "Authorization: Bearer $fakeRelayKey" \
-H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
--data-urlencode "statusUrl=$uri" \
--no-progress-meter &
## Don't overload the system on open curls. Wait until they are below a certain amount to move on
## Or have some fun, set this as high as you like and turn your computer into a space heater!
curls=`ps -ef|grep curl|wc -l`
until [ $curls -lt $maxCurls ]
do
curls=`ps -ef|grep curl|wc -l`
echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
linesLeft=`cat "$source"|wc -l`
echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
sleep 5s
done
fi
done
linesLeft=`cat "$source"|wc -l`
echo "\n \n LINES LEFT: $linesLeft \n\n"
rm "$backfillFile"
done

View File

@ -0,0 +1,73 @@
############################################################################
##
## This script sends URIs to fakerelay based on a saved file of URIS, one
## URI per line. It takes one argument, the filename with the URIs.
##
## The significant difference is that the JSON stream has already been processed
## so this script can post the URIs much faster, as it doesn't have to run the
## JSON stream through jq
##
############################################################################
## Look for environmental variables. Because this script may be run outside of docker
## there is a good chance that they are not set; if they are not, attempt to set them
## via the .env.production file. If that fails, warn and keep going.
if [[ ! $loadEnv && -f ../../.env.production ]]
then
echo "[INFO] Did not detect that environmental variables are set, attempting to set via ../../.env.production"
source ../../.env.production
fi
if [[ ! $loadEnv ]]
then
echo "[WARN] Cannot find environemtnal variables, expect things to break ahead"
sleep 5s
fi
today=`date +"%Y%m%d"`
## The source file we are reading from
source=$1
## Here we de-duplicate the whole file by piping it through sort and uniq,
## so we only pass unique URIs through to the fake relay.
## This step easily cuts the total number of URIs in half and is the only way we can keep up.
seed=`date +%Y%M%d%H%M%S%N`
backfillFile="backfilluris.$seed.txt"
cat "$source" | sort | uniq -u > "$backfillFile"
## Start looping through the unique URIs
cat "$backfillFile" | \
while read -r line
do
if [[ "$line" != "" ]]
then
uri=$line
echo "[INFO] RUN-FIREHOSE: Posting $uri"
sleep 1s
## Send it to the fake relay as a background job
curl -X "POST" "$fakeRelayHost" \
-H "Authorization: Bearer $fakeRelayKey" \
-H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
--data-urlencode "statusUrl=$uri" \
--no-progress-meter &
## Don't overload the system on open curls. Wait until they are below a certain amount to move on
## Or have some fun, set this as high as you like and turn your computer into a space heater!
curls=`ps -ef|grep curl|wc -l`
until [ $curls -lt $maxCurls ]
do
curls=`ps -ef|grep curl|wc -l`
echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
linesLeft=`cat "$source"|wc -l`
echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
sleep 5s
done
fi
done
rm "$backfillFile"

View File

@ -19,7 +19,7 @@ while true
cat backfilluris.txt| \
while read -r uri
do
echo "FIREHOSE: Posting $uri"
echo "[INFO] RUN-FIREHOSE: Posting $uri"
## Send it to the fake relay as a background job
curl -X "POST" "$fakeRelayHost" \
@ -34,9 +34,9 @@ while true
until [ $curls -lt $maxCurls ]
do
curls=`ps -ef|grep curl|wc -l`
echo "FIREHOSE: Waiting for existing curls to finish, at $curls"
echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
linesLeft=`cat "$source"|wc -l`
echo "FIREHOSE: $linesLeft Total URIs left"
echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
sleep 5s
done
@ -49,7 +49,7 @@ while true
until [ $linesLeft -gt $minURIs ]
do
linesLeft=`cat "$source"|wc -l`
echo "FIREHOSE: Waiting for more URIs to batch, currently at $linesLeft"
echo "[INFO] RUN-FIREHOSE: Waiting for more URIs to batch, currently at $linesLeft"
sleep 5s
done
done

View File

@ -2,16 +2,17 @@
echo > /config/urls.txt
echo > /config/hosts
echo > /config/hashtag-urls.txt
# Get federated hosts and begin to stream them
cat /config/domains-federated | grep -v "##" | while read -r line
do
#filter out empty lines
if [[ "$line" != "" ]]; then
echo "Opening federated line $line"
echo "[INFO] Opening federated line $line"
#Check for hashtags
if [[ "$line" == *" #"* ]]; then
if [[ "$line" == *" "* ]]; then
echo "$line has hashtags!"
@ -21,13 +22,13 @@ do
for tag in $tags
do
if [[ $tag != "" ]]; then
echo "Found tag $tag"
echo "[INFO] Found tag $tag"
# Create a url to fetch for each tag
echo "https://$host/api/v1/streaming/hashtag?tag=$tag $host" >> /config/urls.txt
fi
done
elif [[ "$line" != *" #"* ]]; then
echo "$line didn't have hashtags"
echo "[INFO] $line didn't have hashtags"
host=$line
echo "https://$line/api/v1/streaming/public $line" >> /config/urls.txt
fi
@ -41,12 +42,12 @@ cat /config/domains-local | grep -v "##" | while read -r line
do
#filter out empty lines
if [[ "$line" != "" ]]; then
echo "Opening federated line $line"
echo "[INFO] Opening federated line $line"
#Check for hashtags
if [[ "$line" == *" #"* ]]; then
if [[ "$line" == *" "* ]]; then
echo "$line has hashtags!"
echo "[INFO] $line has hashtags!"
# Get just the first field of the line, which is the host
host=`echo $line | cut -d " " -f 1`
@ -54,15 +55,15 @@ do
for tag in $tags
do
if [[ $tag != "" ]]; then
echo "Found tag $tag"
echo "[INFO] Found tag $tag"
# Create a url to fetch for each tag
echo "https://$host/api/v1/streaming/hashtag/local?tag=$tag $host" >> /config/urls.txt
fi
done
elif [[ "$line" != *" #"* ]]; then
echo "$line didn't have hashtags"
echo "[INFO] $line didn't have hashtags"
host=$line
echo "https://$line/api/v1/streaming/local $line" >> /config/urls.txt
echo "https://$line/api/v1/streaming/public/local $line" >> /config/urls.txt
fi
echo $host >> /config/hosts
fi
@ -79,11 +80,14 @@ done
cat /config/hashtag-urls.txt >> /config/urls.txt
cat /config/urls.txt | while read -r url
sort -u /config/urls.txt | while read -r url
do
echo "Opening $url to stream"
sleep 1s
./stream-url.sh $url &
if [[ ! $url == "#"* ]]
then
echo "[INFO] Opening $url to stream"
sleep $streamDelay
./stream-url.sh $url &
fi
done
if [[ $runFirehose == true ]]
@ -94,5 +98,6 @@ fi
## We don't have a health check, so just exit after an hour
# If your docker file has restart: always on this should gracefully exit, and
# then restart
sleep 1h
echo "[INFO] Container restart timoe is $restartTimeout"
sleep $restartTimeout
exit 0

View File

@ -1,37 +1,82 @@
url=$1 #A proper URL is all that should be sent to this script
host=$2
errors=0
if [[ "$url" == "" ]]
then
echo "Empty url, skipping" # Exit if an empty URL was sent
echo "[WARN] Empty url, skipping" # Exit if an empty URL was sent
exit 2
fi
# Check to see if the domain name resolves. If not, exit.
if [[ ! `dig $host +short` ]]
then
echo "[WARN] DNS Lookup failed for $host, skipping"
exit 2
fi
echo "[INFO] Archive is $archive"
while true # Loop endlessly
do
today=`date +"%Y%m%d"`
echo "Starting to stream $url in 5 seconds"
echo "[INFO] Starting to stream $url in 5 seconds"
echo "[INFO] Archive status is $archive"
sleep 5s;
curl -X "GET" "$url" \
--no-progress-meter | \
tee -a "/data/$today.json" | \
grep url | \
sed 's/data://g' | \
# In archive mode we'll only fetch the json stream, to save the resources jq and sed would use
if [[ $archive != "true" ]]
then
#Not in archive mode
while read -r line
do
curl -X "GET" "$url" \
--no-progress-meter | \
tee -a "/data/$today.json" | \
grep url | \
sed 's/data://g' | \
if [[ $line == *"uri"* ]]
then
url=`echo $line | jq .url| sed 's/\"//g'`
uri=`echo $line | jq .uri| sed 's/\"//g'`
while read -r line
do
if [[ $line == *"uri"* ]]
then
url=`echo $line | jq .url| sed 's/\"//g'`
uri=`echo $line | jq .uri| sed 's/\"//g'`
echo "STREAMING from $host $url"
echo $uri >> "/data/$today.uris.txt"
echo "[INFO] Posting $url from $host"
echo $uri >> "/data/$today.uris.txt"
fi
done
# In archive mode
else
if [[ ! -d "/data/$today/" ]]
then
mkdir -p "/data/$today/"
fi
done
done
curl -X "GET" "$url" --no-progress-meter >> "/data/$today/$today.$host.json"
fi
# Basic exponential backoff
((++errors))
sleepseconds=$((errors*errors))
# Don't allow a backoff of more than 5 minutes.
# Because we expect this container to restart occasionally to kill hanging curl processes,
# a graceful exit will wait for all scripts to stop, so it will take at least $sleepseconds seconds
# to stop.
if [[ $sleepseconds -gt 299 ]]
then
sleepseconds=300
fi
sleep $sleepseconds;
echo "[WARN] Streaming abrubtly stopped for $host, streaming will pause for $sleepseconds seconds before retrying."
done
## Exit 0 by default
exit 0