Compare commits
29 Commits
hashtags-b...master

Author | SHA1 | Date |
---|---|---|
raynormast | a4614e886b | |
Christian Bolstad | 237840aa33 | |
Christian Bolstad | 237ed1303d | |
Christian Bolstad | e66b06503d | |
raynor | d466f59781 | |
Raynor | dcc2a551d2 | |
Raynor | 1ccee8638d | |
Raynor | c33b08165f | |
Raynor | 2f35a3cad6 | |
Raynor | 022ac7750f | |
Raynor | e4f1ac5688 | |
Raynor | 046839eba5 | |
Raynor | 2cc0d91ca4 | |
Raynor | 1f9d8adc14 | |
Raynor | 4c316296a5 | |
Raynor | 29eb750165 | |
Raynor | 64c2abf173 | |
Raynor | 1d7a2faf0b | |
Raynor | d2d7f4f621 | |
Raynor | 86d97397df | |
Raynor | 9d3f7f4873 | |
Raynor | 543f896d52 | |
Raynor | bd71dc16b4 | |
Raynor | 3a5a2c34e3 | |
Raynor | d3accb4d65 | |
Raynor | 2acfdf264f | |
Raynor | f561bb4521 | |
raynormast | e877becdb1 | |
raynor | eefa547a13 | |
```diff
@@ -1,27 +0,0 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
-{
-	"name": "Ubuntu",
-	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
-	"image": "mcr.microsoft.com/devcontainers/base:focal",
-	"features": {
-		"ghcr.io/devcontainers/features/git:1": {},
-		"ghcr.io/devcontainers/features/github-cli:1": {},
-		"ghcr.io/eitsupi/devcontainer-features/jq-likes:1": {}
-	}
-
-	// Features to add to the dev container. More info: https://containers.dev/features.
-	// "features": {},
-
-	// Use 'forwardPorts' to make a list of ports inside the container available locally.
-	// "forwardPorts": [],
-
-	// Use 'postCreateCommand' to run commands after the container is created.
-	// "postCreateCommand": "uname -a",
-
-	// Configure tool-specific properties.
-	// "customizations": {},
-
-	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
-	// "remoteUser": "root"
-}
```
```diff
@@ -1,12 +1,34 @@
 fakeRelayKey="YOUR--FAKE---RELAY---KEY"
 fakeRelayHost="https://your-fake-relay-url.YourPetMastodon.com"
 
+## Do you want to send URIs to fake relay or just log them?
+## Set to false if you don't want to send URIs to your fakerelay. Generally this is only used for debugging
+runFirehose=true
+
-# Maximum number of curls processes to run at once
-maxCurls=50
+## Maximum number of curl instances to be allowed to run. This is only used
+## if you send data to the relay
+maxCurls=500
 
-# Minimum number of URIs to have before you process a batch.
-# Don't put this too low or you send over lots of duplicates and burn up your machine
-minURIs=100
+## Minimum number of posts to queue up before sending them to the relay.
+## This is more useful when you are streaming federated timelines from larger instances
+## because you will have a lot of duplicate URIs, wasting resources on your machine.
+## A URI batch is de-duplicated before being sent to the relay.
+##
+## On the other hand, if you are not following timelines that generate a lot of posts
+## a smaller value is appropriate.
+minURIs=100
+
+## Archive mode will save the json stream but not parse it, not even into URIs
+## This will greatly save resources, but obviously will not send it to
+## the relay.
+##
+## Generally only used for debugging or archiving instance streams
+archive=false
+
+## Restart timeout
+## Put whatever amount of time here you want the container to restart after
+## This will kill any hanging curl processes that may be taking up unnecessary
+## resources
+restartTimeout=1h
+
+## How long to delay starting a stream. Leave it here unless you are debugging
+streamDelay="0.1s"
```
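Since everything downstream reads these settings from the environment, a quick sanity check is to source the file and echo the values back. A minimal sketch, assuming you run it from the directory that holds `.env.production` (the placeholder test is our assumption, not part of the repo):

```shell
#!/bin/bash
# Sketch: verify .env.production before starting the stack.
source .env.production

# Fail fast if the two relay settings still look like the shipped placeholders.
if [[ "$fakeRelayKey" == *"FAKE"* || -z "$fakeRelayHost" ]]
then
  echo "[WARN] fakeRelayKey/fakeRelayHost still look like placeholders -- edit .env.production first"
  exit 1
fi

echo "[INFO] relay=$fakeRelayHost runFirehose=$runFirehose maxCurls=$maxCurls minURIs=$minURIs"
```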
```diff
@@ -1,14 +1,10 @@
 .DS_Store
-data/20221217.json
-20221217.uris.txt
 .env.production
 config/domains-federated
 config/domains-local
 config/hashtags
 config/hosts
 .gitignore
-data/20221219.json
-.gitignore
 
+config/hashtag-urls.txt
+config/urls.txt
-data/20221219.uris.txt
-log.txt
```
```diff
@@ -5,7 +5,9 @@ RUN apt install -y \
   jq \
   curl \
   nano \
-  vim
+  vim \
+  dnsutils \
+  wget
 
 COPY ./scripts /scripts
 WORKDIR /scripts
 
```
TODO.md
```diff
@@ -1,3 +1,5 @@
 1. Create language filter
 2. Create bot filter
-3.
+3. Add health check
+4. Use smaller docker base, probably alpine
+5. Create and set non-root user in dockerfile.
```
```diff
@@ -0,0 +1,11 @@
+version: "3.9 "
+
+services:
+  firehose-archive:
+    build: .
+    image: fakefirehose:latest
+    volumes:
+      - ../data:/data
+      - ./config:/config
+    restart: always
+    env_file: .env.production
```
```diff
@@ -1,11 +1,11 @@
 version: "3.9 "
 
 services:
-  fake-firehose:
+  firehose-archive:
     build: .
     image: fakefirehose:latest
     volumes:
-      - ./data:/data
+      - ../data:/data
       - ./config:/config
     restart: always
     env_file: .env.production
```
readme.md
```diff
@@ -1,10 +1,9 @@
 # Fake Firehose
-This project generates the mythical "firehose" relay that small Mastodon instances look for,
-at least to get content.
+This project is basically a shell/bash/text frontend for [fakerelay](https://github.com/g3rv4/FakeRelay)
 
-It's a little crazy.
+It allows instances to fill their federated timelines from other instances that have public timelines.
 
-Find a better way to do it and issue a pull request, or just tell me where your new repo is :)
+You can find the fakefirehose author at [@raynor@raynor.haus](https://raynor.haus/@raynor)
 
 ## How to run it
 
```
```diff
@@ -14,9 +13,9 @@ In the config folder there are three files
 - domains-local
 - hashtags
 
-If you want the full on public feed from an instance, put it in the domains-federated file, one domain per line
+If you want the full on public feed from an instance, put it in the domains-federated file, one domain per line.
 
-If you only want the local feed from an instance, put it on the domains-local file, one domain per line
+If you only want the local feed from an instance, put it on the domains-local file, one domain per line.
 
 If you want to follow a hash tag you either add a hashtag after an instance in `domains-federated` or `domains-local`
 
```
````diff
@@ -26,14 +25,101 @@ stream from mastodon.social
 Another example: if in `domains-local` you put `infosec.exchange #hacker` a stream will open to watch for the hashtag #hacker on the _local_ stream from infosec.exchange
 
 ## Docker
-Build docker
+To run it in docker -- recommended
 
-Run docker
+1. Make sure you have [docker installed](https://docs.docker.com/engine/install/).
+2. From your shell: create a directory; it is recommended that you give it a relevant name
+3. Go into that directory and use `git clone https://github.com/raynormast/fake-firehose.git`
+4. Go into the created directory: `cd fake-firehose`
+5. `sudo docker build -t fakefirehose .`
+6. Edit your `docker-compose.yml` file as needed. **The biggest thing** is to watch the volumes. It is _highly_ recommended that you keep your data directory in the parent directory, and NOT the directory the git repo is in.
+7. Edit your `.env.production` file. The file is fairly well commented.
+8. Run `sudo docker compose -f docker-compose.yml up -d`
 
-### The hashtags file
-If you put ANY hashtags in here a stream will be opened for _every_ host in the `domains-federated` and `domains-local` file.
+The entire thing should look something like:
+```
+cd ~
+mkdir MastodonFirehose
+cd MastodonFirehose
+git clone https://github.com/raynormast/fake-firehose.git
+cd fake-firehose
+docker build -t fakefirehose .
+# Edit your docker-compose and .env.production here
+sudo docker compose -f docker-compose.yml up -d
+```
+
+# Configuration
+
+## tl;dr
+Your `./config` folder has three sample files, after editing you should have the following three files:
+```
+domains-federated
+domains-local
+hashtags
+```
+
+**In each file, comments begin with `##` not the traditional single `#`.**
+
+The syntax is the same for the domains files:
+```
+## Follow full timeline
+mastodon.instance
+
+## Follow these hashtags from the timeline
+mastodon.instance #mastodon #relay
+```
+
+The files are well commented.
+
+## domains-federated file
+This file has the full federated feeds of any instances you want fed to fakerelay.
+
+Each line of the file should have the domain name of an instance whose federated timeline you want to follow.
+I.e.,
+```
+raynor.haus
+infosec.exchange
+```
+
+This can generate a LOT of posts if you choose a large instance.
+
+For example, if you use `mastodon.social` or `mas.to` you can expect your server to fall behind. `mastodon.social` generates 50,000 - 200,000 posts on the federated timeline per day.
+
+It is recommended that you only use this file to:
+- follow hashtags
+- follow instances with small federated timelines, with content you want in yours
+
+#### domains-federated hashtags
+The one time to use the federated timeline is to catch most posts with a specific hashtag.
+
+Every word after an instance domain is a hashtag to relay.
+
+Example:
+
+`mastodon.social fediblock fediverse mastodev mastoadmin`
+
+Will only return posts from the mastodon.social federated feed with hashtags of `#fediblock`, `#fediverse`,
+`#mastodev`, and `#mastoadmin`.
+
+The `#` is optional -- it is accepted simply to make the file more intuitive.
+
+## domains-local file
+This file is identical to the `domains-federated` file except that it only receives posts created on
+_that_ instance (the local timeline).
+
+It is possible to keep up with the larger instances, such as `mastodon.social`, if you only look at the
+local timeline.
+
+## hashtags file
+If you put ANY hashtags in here a stream will be opened for _every_ host in the `domains-federated` and `domains-local` file.
+
+**Its purpose is for people or instances that want to find nearly every post with a particular hashtag**
+
+_It can very quickly open up a lot of `curl` streams_
+
+### Example
+`domains-federated` content:
+
+```
````
````diff
@@ -56,6 +142,10 @@ Mastodon
 
 will result in the following streams all opening:
 ```shell
 https://mastodon.social/api/v1/streaming/public
 https://mas.to/api/v1/streaming/public
+https://aus.social/api/v1/streaming/public/local
+https://mastodon.nz/api/v1/streaming/public/local
+https://mastodon.social/api/v1/streaming/hashtag?tag=JohnMastodon
+https://mas.to/api/v1/streaming/hashtag?tag=JohnMastodon
 https://aus.social/api/v1/streaming/hashtag?tag=JohnMastodon
 
````
````diff
@@ -67,6 +157,164 @@ https://mastodon.nz/api/v1/streaming/hashtag?tag=Mastodon
 ```
 
 If you had a total of 5 lines in `domains-federated` and `domains-local` plus 3 entries in `hashtags`
-there would 5x5x3 = 75 new streams.
+there would be 5 x 5 x 3 = 75 new streams.
 
-I mean, you can do it, but you won't need your central heating system any more.
+Usually a more targeted approach is better.
 
+It is recommended that you put hashtags in your `domains-federated` or `domains-local` files.
+
+Your humble author's federated file currently looks like this:
+```
+mastodon.social infosec hacker hackers osint hive lockbit hackgroup apt vicesociety
+
+mastodon.social blackmastodon blackfediverse poc actuallyautistic neurodivergent blacklivesmatter freechina antiracist neurodiversity blackhistory bipoc aapi asian asianamerican pacificislander indigenous native
+
+mastodon.social fediblock fediverse mastodev mastoadmin
+mastodon.social apple politics vegan trailrunning church churchillfellowship christianity christiannationalism
+```
+
+My `domains-local` file is:
+```
+## Fake Firehose will only take local posts from these domains
+
+mastodon.social
+universeodon.com
+
+## International English (if you aren't from the US) ###
+## mastodon.scot
+aus.social
+mastodon.nz
+respublicae.eu
+mastodon.au
+
+### Tech ###
+partyon.xyz
+infosec.exchange
+ioc.exchange
+tech.lgbt
+techhub.social
+fosstodon.org
+appdot.net
+social.linux.pizza
+
+journa.host
+climatejustice.social
+```
+
+This generates an acceptable stream of posts for my federated timeline. The tags I follow on mastodon.social
+are those that are either few in number overall, or are harder to find on local timelines.
+
+## .env.production
+tl;dr: This file is fairly well commented internally, just go at it.
+
+**The sample file probably does not need any changes beyond your fakerelay information**
+
+### options
+#### fakeRelayKey
+This needs to have the key you generated with fakerelay.
+
+_Example_:
+`fakeRelayKey="MrNtYH+GjwDtJtR6YCx2O4dfasdf2349QtZaVni0rsbDryETCx9lHSZmzcOAv3Y8+4LiD8bFUZbnyl4w=="`
+
+#### fakeRelayHost
+The full URL to your fakerelay
+
+_Example_:
+`fakeRelayHost="https://fr-relay-post.myinstance.social/index"`
+
+#### runFirehose
+This controls whether the posts will actually be sent to your relay, or only collected in your /data folder.
+You almost certainly want this set at:
+
+`runFirehose=true`
+
+The _only_ reason to set it to `false` is for debugging, or logging posts from the fediverse.
+
+#### maxCurls and minURIs
+These two options are closely related. `maxCurls` is the maximum number of `curl` processes you want to have
+running on your system at once. If you follow timelines with a lot of posts, you may need to limit this.
+
+**Note** This always needs to be higher than the total number of instances + hashtags you have configured, because each one of those is a separate `curl` process
+
+fake-firehose batches posts to de-duplicate them, `minURIs` is the size of that batch. If you have a lot of
+_federated_ posts coming in you will want to set this to a high number because a lot of them will be duplicates.
+
+If you only use local timelines it doesn't matter, you will not have any duplicates.
+
+It is a tradeoff between resources (and `curl` processes running) and how quickly you want to fill your
+instance's federated timeline.
+
+_Example for a moderate number of incoming posts_:
+```
+## Max curl processes have not gotten out of control so this is absurdly high.
+maxCurls=2000
+
+## Nearly all of the timelines I follow are local, so there are very few duplicates.
+minURIs=10
+```
+
+#### archive
+Archive mode will save the json stream but not parse it, not even into URIs.
+This will greatly save resources, but obviously will not send it to
+the relay.
+
+**The only reasons to use this are debugging or logging posts from servers**.
+
+You almost certainly want this set at:
+
+```archive=false```
+
+#### restartTimeout
+This is how long the docker image will run before exiting. As long as your `docker-compose` has `restart: always` set this simply restarts the image to kill any hung `curl` processes.
+
+The only reason to set it high is if you have a lot of timelines you follow. Each one takes time to open up,
+so if you restart often you will miss more posts.
+
+_Example:_
+
+`restartTimeout=4h`
+
+#### streamDelay
+This is only for debugging.
+
+Keep it at:
+
+`streamDelay="0.1s"`
+
+# Data Directory
+Data is saved in the format of:
+```
+"%Y%m%d".uris.txt
+```
+
+In archive mode the format is:
+```
+"/data/"%Y%m%d"/"%Y%m%d".$host.json"
+```
+
+For example, if you set `archive=true` and had `mastodon.social` in your `domains-federated` or `domains-local` config, on January 1st, 2023 the json stream would be saved at
+```
+/data/20230101/20230101.mastodon.social.json
+```
+
+# Misc
+## Backoff
+An exponential backoff starts if `curl` fails. It is rudimentary and maxes out at 15 minutes.
+
+## DNS lookup
+Before a URL starts streaming fakefirehose will look up the DNS entry of the host. If it fails,
+the stream will not begin, _and will not attempt to begin again_ until the container is restarted.
+
+## Permissions
+The permissions of the outputted data files will be set to `root` by default. This will get fixed
+in a future release.
+
+# Why fake firehose?
+When I wrote this there were no other options I was aware of to fill the federated timeline of a small instance.
+The work of [Gervasio Marchand](https://mastodonte.tech/@g3rv4) is fantastic but still required programming knowledge to make use of.
+
+I wanted the simplest setup and config I could create, without setting up an entirely new web UI.
+
+There are a lot of things to do better, I'll work on the ones I have time and capability for. Otherwise, this project
+is practically begging to be re-written in python or something else.
````
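The relay call that all of this configuration feeds appears in the scripts below; pulled out as a standalone sketch, posting one status URL to FakeRelay looks like this (the host, key, and status URL are placeholders, not real endpoints or credentials):

```shell
#!/bin/bash
# Sketch: relay a single status URL by hand, mirroring the curl the scripts use.
fakeRelayHost="https://fake-relay.example.com/index"   # placeholder
fakeRelayKey="REPLACE-WITH-YOUR-KEY"                   # placeholder
statusUrl="https://mastodon.example/users/someone/statuses/1"  # placeholder

curl -X "POST" "$fakeRelayHost" \
  -H "Authorization: Bearer $fakeRelayKey" \
  -H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
  --data-urlencode "statusUrl=$statusUrl" \
  --no-progress-meter
```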
```diff
@@ -1,46 +0,0 @@
-host=$1
-type=$2
-hashtag=$1
-
-if [[ "$host" == "" ]]
-then
-  echo "Empty host: $host"
-  exit 2
-fi
-
-while true
-do
-  today=`date +"%Y%m%d"`
-
-  case "$type" in
-    "federated")
-      fetch="https://$host/api/v1/streaming/public";;
-    "local")
-      fetch="https://$host/api/v1/streaming/public?local=true";;
-  esac
-
-  echo "Starting to stream $fetch in 5 seconds"
-
-  sleep 5s;
-
-  curl -X "GET" "$fetch" \
-  --no-progress-meter | \
-  tee -a "/data/$today.json" | \
-  grep url | \
-  sed 's/data://g' | \
-
-  while read -r line
-  do
-
-    if [[ $line == *"uri"* ]]
-    then
-      url=`echo $line | jq .url| sed 's/\"//g'`
-      uri=`echo $line | jq .uri| sed 's/\"//g'`
-
-      echo "STREAMING: $host $url"
-      echo $uri >> "/data/$today.uris.txt"
-
-    fi
-  done
-done
```
```diff
@@ -0,0 +1,19 @@
+############################################################################
+##
+## This script exports URIs from a saved JSON stream. It uses the same logic
+## as stream-url.sh, except that it reads the JSON from a file.
+## It takes one argument, the input file name.
+##
+############################################################################
+
+source=$1
+
+cat "$source"|grep -A 1 "event: update"|grep "data:" | \
+while read -r line
+do
+  if [[ $line == *"uri"* ]]
+  then
+    uri=`echo $line | sed 's/data: //g' | jq .uri| sed 's/\"//g'`
+    echo "$uri"
+  fi
+done
```
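A usage sketch for this helper (the script name and the archive paths are illustrative; the page does not show what the file is called in the repo):

```shell
# Extract the URIs from one day's archived stream and save them for replay.
# "export-uris.sh" and both /data paths are assumed names for illustration.
./export-uris.sh /data/20230101/20230101.mastodon.social.json > /data/20230101.uris.txt
```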
```diff
@@ -0,0 +1,83 @@
+############################################################################
+##
+## This script sends URIs to fakerelay based on a saved JSON stream.
+## It takes one argument, the input file name.
+##
+############################################################################
+
+## Look for environmental variables. Because this script may be run outside of docker
+## there is a good chance that they are not set; if they are not, attempt to set them
+## via the .env.production file. If that fails warn and keep going
+if [[ ! $loadEnv && -f ../../.env.production ]]
+then
+  echo "[INFO] Did not detect that environmental variables are set, attempting to set via ../../.env.production"
+  source ../../.env.production
+fi
+
+if [[ ! $loadEnv ]]
+then
+  echo "[WARN] Cannot find environmental variables, expect things to break ahead"
+  sleep 5s
+fi
+
+today=`date +"%Y%m%d"`
+
+## The source file we are reading from
+source=$1
+
+while true
+do
+
+  if [[ -f "./maxcurls" ]]
+  then
+    maxCurls=`cat ./maxcurls`
+  fi
+
+  ## Here we take the top 500 lines of the file -- so we are in FIFO
+  ## and pipe them thru uniq so we only pass unique URIs through to the fake relay
+  ## This step easily cuts the total number of URIs in half and is the only way we can keep up
+
+  ## Make sure that you have the same number in the following two lines. In this repo, it is currently at 500
+  seed=`date +%Y%M%d%H%M%S%N`
+  backfillFile="backfilluris.$seed.txt"
+  sedExpression="1,${minURIs}d"
+  sed -i $sedExpression "$source"
+  head "$source" -n $minURIs | sort | uniq -u > "$backfillFile"
+
+  ## Start looping through the unique URIs
+  cat "$backfillFile" | \
+  while read -r line
+  do
+    if [[ "$line" != "" ]]
+    then
+
+      uri=`echo $line | sed 's/data: //g' | jq .uri| sed 's/\"//g'`
+      echo "[INFO] RUN-FIREHOSE: Posting $uri"
+
+      ## Send it to the fake relay as a background job
+      curl -X "POST" "$fakeRelayHost" \
+      -H "Authorization: Bearer $fakeRelayKey" \
+      -H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
+      --data-urlencode "statusUrl=$uri" \
+      --no-progress-meter &
+
+      ## Don't overload the system on open curls. Wait until they are below a certain amount to move on
+      ## Or have some fun, set this as high as you like and turn your computer into a space heater!
+      curls=`ps -ef|grep curl|wc -l`
+      until [ $curls -lt $maxCurls ]
+      do
+        curls=`ps -ef|grep curl|wc -l`
+        echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
+        linesLeft=`cat "$source"|wc -l`
+        echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
+        sleep 5s
+      done
+    fi
+
+  done
+
+  linesLeft=`cat "$source"|wc -l`
+  echo "\n \n LINES LEFT: $linesLeft \n\n"
+  rm "$backfillFile"
+done
```
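Because the script only falls back to `../../.env.production` when `$loadEnv` is unset, running it by hand means providing the settings yourself. A hedged usage sketch (the script name and the `loadEnv=true` convention are assumptions based on the checks above):

```shell
# Load the settings, then mark them as loaded so the script skips its fallback.
source .env.production
export loadEnv=true fakeRelayHost fakeRelayKey maxCurls minURIs

# Feed it a saved JSON stream; the file name is illustrative.
./run-firehose-from-json.sh /data/20230101.json
```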
```diff
@@ -0,0 +1,73 @@
+############################################################################
+##
+## This script sends URIs to fakerelay based on a saved file of URIs, one
+## URI per line. It takes one argument, the filename with the URIs
+##
+## The significant difference is that the JSON stream has already been processed
+## so this script can post the URIs much faster, as it doesn't have to run the
+## JSON stream through jq
+##
+############################################################################
+
+## Look for environmental variables. Because this script may be run outside of docker
+## there is a good chance that they are not set; if they are not, attempt to set them
+## via the .env.production file. If that fails warn and keep going
+if [[ ! $loadEnv && -f ../../.env.production ]]
+then
+  echo "[INFO] Did not detect that environmental variables are set, attempting to set via ../../.env.production"
+  source ../../.env.production
+fi
+
+if [[ ! $loadEnv ]]
+then
+  echo "[WARN] Cannot find environmental variables, expect things to break ahead"
+  sleep 5s
+fi
+
+today=`date +"%Y%m%d"`
+
+## The source file we are reading from
+source=$1
+
+## Here we take the top $minURIs lines of the file -- so we are in FIFO
+## and pipe them thru uniq so we only pass unique URIs through to the fake relay
+## This step easily cuts the total number of URIs in half and is the only way we can keep up
+
+seed=`date +%Y%M%d%H%M%S%N`
+backfillFile="backfilluris.$seed.txt"
+cat "$source" | sort | uniq -u > "$backfillFile"
+
+## Start looping through the unique URIs
+cat "$backfillFile" | \
+while read -r line
+do
+  if [[ "$line" != "" ]]
+  then
+
+    uri=$line
+    echo "[INFO] RUN-FIREHOSE: Posting $uri"
+    sleep 1s
+
+    ## Send it to the fake relay as a background job
+    curl -X "POST" "$fakeRelayHost" \
+    -H "Authorization: Bearer $fakeRelayKey" \
+    -H 'Content-Type: application/x-www-form-urlencoded; charset=utf-8' \
+    --data-urlencode "statusUrl=$uri" \
+    --no-progress-meter &
+
+    ## Don't overload the system on open curls. Wait until they are below a certain amount to move on
+    ## Or have some fun, set this as high as you like and turn your computer into a space heater!
+    curls=`ps -ef|grep curl|wc -l`
+    until [ $curls -lt $maxCurls ]
+    do
+      curls=`ps -ef|grep curl|wc -l`
+      echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
+      linesLeft=`cat "$source"|wc -l`
+      echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
+      sleep 5s
+    done
+  fi
+
+done
+
+rm "$backfillFile"
```
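Both relay scripts inline the same throttling loop; factored into a function it reads as below (the function name is ours, purely for illustration; the repo keeps this inline):

```shell
# Sketch: block until the number of running curl processes drops below a cap.
# Note the count includes the grep process itself, exactly as in the original loop.
wait_for_curl_slots() {
  local cap=$1 curls
  curls=$(ps -ef | grep curl | wc -l)
  until [ "$curls" -lt "$cap" ]
  do
    echo "[INFO] Waiting for existing curls to finish, at $curls"
    sleep 5s
    curls=$(ps -ef | grep curl | wc -l)
  done
}

wait_for_curl_slots "$maxCurls"   # e.g. after launching each background curl
```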
```diff
@@ -19,7 +19,7 @@ while true
   cat backfilluris.txt| \
   while read -r uri
   do
-    echo "FIREHOSE: Posting $uri"
+    echo "[INFO] RUN-FIREHOSE: Posting $uri"
 
     ## Send it to the fake relay as a background job
     curl -X "POST" "$fakeRelayHost" \
@@ -34,9 +34,9 @@ while true
     until [ $curls -lt $maxCurls ]
     do
       curls=`ps -ef|grep curl|wc -l`
-      echo "FIREHOSE: Waiting for existing curls to finish, at $curls"
+      echo "[INFO] RUN-FIREHOSE: Waiting for existing curls to finish, at $curls"
       linesLeft=`cat "$source"|wc -l`
-      echo "FIREHOSE: $linesLeft Total URIs left"
+      echo "[INFO] RUN-FIREHOSE:$linesLeft Total URIs left"
       sleep 5s
     done
@@ -49,7 +49,7 @@ while true
   until [ $linesLeft -gt $minURIs ]
   do
     linesLeft=`cat "$source"|wc -l`
-    echo "FIREHOSE: Waiting for more URIs to batch, currently at $linesLeft"
+    echo "[INFO] RUN-FIREHOSE: Waiting for more URIs to batch, currently at $linesLeft"
     sleep 5s
   done
 done
 
```
```diff
@@ -2,16 +2,17 @@
 
 echo > /config/urls.txt
+echo > /config/hosts
 echo > /config/hashtag-urls.txt
 
 # Get federated hosts and begin to stream them
 cat /config/domains-federated | grep -v "##" | while read -r line
 do
   #filter out empty lines
   if [[ "$line" != "" ]]; then
-    echo "Opening federated line $line"
+    echo "[INFO] Opening federated line $line"
 
     #Check for hashtags
-    if [[ "$line" == *" #"* ]]; then
+    if [[ "$line" == *" "* ]]; then
 
       echo "$line has hashtags!"
 
@@ -21,13 +22,13 @@ do
       for tag in $tags
       do
         if [[ $tag != "" ]]; then
-          echo "Found tag $tag"
+          echo "[INFO] Found tag $tag"
           # Create a url to fetch for each tag
           echo "https://$host/api/v1/streaming/hashtag?tag=$tag $host" >> /config/urls.txt
         fi
       done
     elif [[ "$line" != *" #"* ]]; then
-      echo "$line didn't have hashtags"
+      echo "[INFO] $line didn't have hashtags"
       host=$line
       echo "https://$line/api/v1/streaming/public $line" >> /config/urls.txt
     fi
@@ -41,12 +42,12 @@ cat /config/domains-local | grep -v "##" | while read -r line
 do
   #filter out empty lines
   if [[ "$line" != "" ]]; then
-    echo "Opening federated line $line"
+    echo "[INFO] Opening federated line $line"
 
     #Check for hashtags
-    if [[ "$line" == *" #"* ]]; then
+    if [[ "$line" == *" "* ]]; then
 
-      echo "$line has hashtags!"
+      echo "[INFO] $line has hashtags!"
 
       # Get just the first field of the line, which is the host
       host=`echo $line | cut -d " " -f 1`
@@ -54,15 +55,15 @@ do
       for tag in $tags
       do
         if [[ $tag != "" ]]; then
-          echo "Found tag $tag"
+          echo "[INFO] Found tag $tag"
           # Create a url to fetch for each tag
          echo "https://$host/api/v1/streaming/hashtag/local?tag=$tag $host" >> /config/urls.txt
         fi
       done
     elif [[ "$line" != *" #"* ]]; then
-      echo "$line didn't have hashtags"
+      echo "[INFO] $line didn't have hashtags"
       host=$line
-      echo "https://$line/api/v1/streaming/local $line" >> /config/urls.txt
+      echo "https://$line/api/v1/streaming/public/local $line" >> /config/urls.txt
     fi
     echo $host >> /config/hosts
   fi
@@ -79,11 +80,14 @@ done
 
 cat /config/hashtag-urls.txt >> /config/urls.txt
 
-cat /config/urls.txt | while read -r url
+sort -u /config/urls.txt | while read -r url
 do
-  echo "Opening $url to stream"
-  sleep 1s
-  ./stream-url.sh $url &
+  if [[ ! $url == "#"* ]]
+  then
+    echo "[INFO] Opening $url to stream"
+    sleep $streamDelay
+    ./stream-url.sh $url &
+  fi
 done
 
 if [[ $runFirehose == true ]]
@@ -94,5 +98,6 @@ fi
 ## We don't have a health check, so just exit after an hour
 # If your docker file has restart: always on, this should gracefully exit, and
 # then restart
-sleep 1h
+echo "[INFO] Container restart time is $restartTimeout"
+sleep $restartTimeout
 exit 0
```
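To make the URL generation concrete: a `domains-local` line of `infosec.exchange #hacker` goes through the hashtag branch, while a bare `aus.social` goes through the `elif`, so `/config/urls.txt` ends up with lines like the following (derived from the echo statements above):

```
https://infosec.exchange/api/v1/streaming/hashtag/local?tag=hacker infosec.exchange
https://aus.social/api/v1/streaming/public/local aus.social
```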
```diff
@@ -1,37 +1,82 @@
 url=$1 #A proper URL is all that should be sent to this script
+host=$2
+errors=0
 
 if [[ "$url" == "" ]]
 then
-  echo "Empty url, skipping" # Exit if an empty URL was sent
+  echo "[WARN] Empty url, skipping" # Exit if an empty URL was sent
   exit 2
 fi
 
+# Check to see if domain name resolves. If not, exit
+if [[ ! `dig $host +short` ]]
+then
+  echo "[WARN] DNS Lookup failed for $host, skipping"
+fi
+
+echo "[INFO] Archive is $archive"
+
 while true # Loop endlessly
 do
 
   today=`date +"%Y%m%d"`
 
-  echo "Starting to stream $url in 5 seconds"
+  echo "[INFO] Starting to stream $url in 5 seconds"
+  echo "[INFO] Archive status is $archive"
 
   sleep 5s;
 
-  curl -X "GET" "$url" \
-  --no-progress-meter | \
-  tee -a "/data/$today.json" | \
-  grep url | \
-  sed 's/data://g' | \
+  # In archive mode we'll only fetch the json stream to save resources from jq and sed
+  if [[ $archive != "true" ]]
+  then
+    #Not in archive mode
 
-  while read -r line
-  do
+    curl -X "GET" "$url" \
+    --no-progress-meter | \
+    tee -a "/data/$today.json" | \
+    grep url | \
+    sed 's/data://g' | \
 
-    if [[ $line == *"uri"* ]]
-    then
-      url=`echo $line | jq .url| sed 's/\"//g'`
-      uri=`echo $line | jq .uri| sed 's/\"//g'`
+    while read -r line
+    do
+      if [[ $line == *"uri"* ]]
+      then
+        url=`echo $line | jq .url| sed 's/\"//g'`
+        uri=`echo $line | jq .uri| sed 's/\"//g'`
 
-      echo "STREAMING from $host $url"
-      echo $uri >> "/data/$today.uris.txt"
+        echo "[INFO] Posting $url from $host"
+        echo $uri >> "/data/$today.uris.txt"
+      fi
+    done
 
-    fi
-  done
-done
+  # In archive mode
+  else
+
+    if [[ ! -d "/data/$today/" ]]
+    then
+      mkdir -p "/data/$today/"
+    fi
+
+    curl -X "GET" "$url" --no-progress-meter >> "/data/$today/$today.$host.json"
+  fi
+
+  # Basic exponential backoff
+  ((++errors))
+  sleepseconds=$((errors*errors))
+
+  # Don't allow a back off for more than 5 minutes.
+  # Because we expect this container to reset occasionally to kill hanging curl processes
+  # a graceful exit will wait for all scripts to stop. So, it will take at least as long as $sleepseconds
+  # to stop.
+  if [[ $sleepseconds -gt 299 ]]
+  then
+    sleepseconds=300
+  fi
+
+  sleep $sleepseconds;
+
+  echo "[WARN] Streaming abruptly stopped for $host, streaming will pause for $sleepseconds seconds before retrying."
+
+done
+
+## Exit 0 by default
+exit 0
```
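To make the backoff schedule concrete: the wait grows as the square of consecutive errors and is clamped at 300 seconds, so retries sleep 1 s, 4 s, 9 s, ... and from the 18th failure on (18² = 324) every retry waits the full five minutes. A tiny sketch of the same arithmetic:

```shell
# Sketch: print the retry schedule implied by sleepseconds=errors*errors, capped at 300.
for errors in 1 2 3 5 10 17 18 30
do
  s=$((errors * errors))
  [ $s -gt 299 ] && s=300
  echo "failure #$errors -> sleep ${s}s"
done
```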