Separate federated vs local feeds

raynor 2022-12-17 20:45:49 -05:00
parent 0fc0470eb1
commit df6920efc7
10 changed files with 111 additions and 20 deletions

12 .env.production.sample Normal file

@@ -0,0 +1,12 @@
fakeRelayKey="YOUR--FAKE---RELAY---KEY"
fakeRelayHost="https://your-fake-relay-url.YourPetMastodon.com"
## Do you want to send URIs to the fake relay, or just log them?
runFirehose=true
# Maximum number of curl processes to run at once
maxCurls=50
# Minimum number of URIs to queue before processing a batch.
# Don't set this too low, or you'll send lots of duplicates and burn up your machine
minURIs=100
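
Inside the container these values arrive through docker-compose's env_file, so the scripts read them as ordinary environment variables ($minURIs, $maxCurls, and friends). To run the scripts outside Docker, a minimal sketch, assuming you've copied this sample to .env.production:

    # export everything defined in .env.production into the current shell
    set -a
    . ./.env.production
    set +a
    echo "batching at $minURIs URIs, capping at $maxCurls concurrent curls"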

4 .gitignore vendored

@@ -1,3 +1,7 @@
.DS_Store
data/20221217.json
20221217.uris.txt
.env.production
config/domains-federated
config/domains-local
config/hashtags

8 config/domains-federated.sample Normal file

@@ -0,0 +1,8 @@
## Fake Firehose will take all public posts from these domains
## This is the true firehose; use it carefully or you'll blow up your server
### International English (if you aren't from the US) ###
mastodon.scot
aus.social
mastodon.nz
respublicae.eu
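
Before adding a domain here, it's worth checking that the instance exposes public streaming anonymously; some instances require a token, in which case get-stream.sh will get an error instead of a stream. A quick test sketch, using mastodon.scot purely as an example:

    # expect a long-lived stream of server-sent events;
    # an HTTP 401 means the instance wants authentication
    curl --no-progress-meter "https://mastodon.scot/api/v1/streaming/public" | head -n 5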

11 config/domains-local.sample Normal file

@@ -0,0 +1,11 @@
## Fake Firehose will only take local posts from these domains
### Tech ###
infosec.exchange
ioc.exchange
tech.lgbt
techhub.social
fosstodon.org
### News & Politics ###
journa.host

0 config/hashtags.sample Normal file

3 docker-compose.yml

@@ -7,4 +7,5 @@ services:
volumes:
- ./data:/data
- ./config:/config
restart: always
restart: always
env_file: .env.production

12 README.md

@@ -8,6 +8,18 @@ Find a better way to do it and issue a pull request, or just tell me where your
## How to run it
In the config folder there are three files:
- domains-federated
- domains-local
- hashtags
If you want the full public feed from an instance, put it in the domains-federated file, one domain per line.
If you only want the local feed from an instance, put it in the domains-local file, one domain per line.
If you want to follow a hashtag, you're out of luck because I didn't get that far. But it will go into the hashtags file.
Build Docker.
Run Docker (see the sketch below).
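
A minimal sketch of those two steps, assuming docker compose v2 and that you create the real config files from the sample filenames shown above:

    cp .env.production.sample .env.production
    cp config/domains-federated.sample config/domains-federated
    cp config/domains-local.sample config/domains-local
    cp config/hashtags.sample config/hashtags
    docker compose build
    docker compose up -d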

33 scripts/get-stream.sh Executable file → Normal file

@@ -1,8 +1,33 @@
host=$1
type=$2
if [[ "$host" == "" ]]
then
echo "Empty host: $host"
exit 2
fi
while true
do
today=`date +"%Y%m%d"`
curl -X "GET" "https://$host/api/v1/streaming/public?&local=true" \
case "$type" in
"federated")
fetch="https://$host/api/v1/streaming/public";;
"local")
fetch="https://$host/api/v1/streaming/public?local=true";;
"hashtags")
fetch="https://$host/api/v1/streaming/hashtag?tag=linux"
echo "Sorry, hash tags aren't implemented yet :("
exit 1
;;
esac
echo "Starting to stream $fetch in 5 seconds"
sleep 5s;
curl -X "GET" "$fetch" \
--no-progress-meter | \
tee -a "/data/$today.json" | \
grep url | \
@@ -16,9 +41,9 @@ do
url=`echo $line | jq .url| sed 's/\"//g'`
uri=`echo $line | jq .uri| sed 's/\"//g'`
echo "$host $url"
echo $uri >> "$today.uris.txt"
echo "STREAMING: $host $url"
echo $uri >> "/data/$today.uris.txt"
fi
done
done
done
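
get-stream.sh is normally launched by start-firehose.sh, but you can exercise it by hand; a sketch, assuming a /data directory exists for the output files:

    # stream one instance's federated feed; posts accumulate in
    # /data/YYYYMMDD.json and their URIs in /data/YYYYMMDD.uris.txt
    ./scripts/get-stream.sh mastodon.scot federated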

27 scripts/run-firehose.sh Executable file → Normal file

@@ -4,38 +4,40 @@ while true
## This assumes that we have other scripts that are writing to the file called
## $source, which here is today's date appended with .uris.txt
today=`date +"%Y%m%d"`
source="$today.uris.txt"
source="/data/$today.uris.txt"
## Here we take the top 500 lines of the file -- so we are in FIFO
## and pipe them thru uniq so we only pass unique URIs through to the fake relay
## This step easily cuts the total number of URIs in half and is the only way we can keep up
## Make sure that you have the same number in the following two lines. In this repo, it is currently at 500
head "$source" -n 500 | sed 's/\"//g' | sort | uniq -u > backfilluris.txt
sed -i '1,500d' "$source"
head "$source" -n $minURIs | sed 's/\"//g' | sort | uniq -u > backfilluris.txt
sedExpression="1,${minURIs}d"
sed -i $sedExpression "$source"
## Start looping through the unique URIs
cat backfilluris.txt| \
while read -r uri
do
# echo BACKFILL $url;
echo "FIREHOSE: Posting $uri"
## Send it to the fake relay as a background job
curl -X "POST" "$fakeRelayHost" \
-H "Authorization: Bearer $fakeRelayKey" \
-H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
--data-urlencode "statusUrl=$uri" &
--data-urlencode "statusUrl=$uri" \
--no-progress-meter &
## Don't overload the system on open curls. Wait until they are below a certain amount to move on
## Or have some fun, set this as high as you like and turn your computer into a space heater!
curls=`ps -ef|grep curl|wc -l`
until [ $curls -lt 100 ]
until [ $curls -lt $maxCurls ]
do
curls=`ps -ef|grep curl|wc -l`
echo "Waiting for existing curls to finish, at $curls"
echo "FIREHOSE: Waiting for existing curls to finish, at $curls"
linesLeft=`cat "$source"|wc -l`
echo "$linesLeft Total URIs left"
sleep 1s
echo "FIREHOSE: $linesLeft Total URIs left"
sleep 5s
done
done
@@ -44,11 +46,10 @@ while true
## Wait until the queue is at least 500 lines long; with fewer than that,
## there are not enough lines to spot the duplicates.
until [ $linesLeft -gt 500 ]
until [ $linesLeft -gt $minURIs ]
do
linesLeft=`cat "$source"|wc -l`
sleep 1s
echo "Waiting for more URIs to batch, currently at $linesLeft"
echo "FIREHOSE: Waiting for more URIs to batch, currently at $linesLeft"
sleep 5s
done
done
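
One subtlety in the batching step above: sort | uniq -u keeps only lines that occur exactly once, so a URI picked up by two streams is dropped from the batch entirely rather than collapsed to a single copy, which is why it cuts the volume so sharply. A quick illustration:

    printf 'a\nb\na\nc\n' | sort | uniq -u
    # prints b and c; the duplicated "a" disappears completely
    # (sort -u would instead keep one copy of each line)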

21 scripts/start-firehose.sh Executable file → Normal file

@@ -1,8 +1,25 @@
#!/bin/bash
cat /config/domains|grep -v "#"|while read -r host
cat /config/domains-federated|grep -v "#"|while read -r host
do
/scripts/get-stream.sh $host &
if [[ "$host" != "" ]]
then
/scripts/get-stream.sh $host "federated" &
fi
done
cat /config/domains-local|grep -v "#"|while read -r host
do
if [[ "$host" != "" ]]
then
/scripts/get-stream.sh $host "local" &
fi
done
if [[ $runFirehose == true ]]
then
/scripts/run-firehose.sh &
fi
## Don't let the container exit
while true; do sleep 1; done
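
One design note on that keep-alive loop: the get-stream.sh jobs are backgrounded inside the cat | while pipelines, so they are children of those subshells and a plain wait here would return immediately; hence the explicit loop. A lighter equivalent sketch, assuming GNU coreutils in the image:

    # sleep forever instead of waking up every second
    sleep infinity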