Bunch of fixes to scanning and pruning
This commit is contained in:
parent
2b72758365
commit
70223ebbef
|
@ -2,42 +2,46 @@
|
||||||
echo "Pruning torrents.csv ..."
|
echo "Pruning torrents.csv ..."
|
||||||
cd ..
|
cd ..
|
||||||
torrents_csv="`pwd`/torrents.csv"
|
torrents_csv="`pwd`/torrents.csv"
|
||||||
|
torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv"
|
||||||
scanned_out="`pwd`/infohashes_scanned.txt"
|
scanned_out="`pwd`/infohashes_scanned.txt"
|
||||||
|
|
||||||
|
cp $torrents_csv $torrents_csv_tmp
|
||||||
|
|
||||||
# Remove lines that don't have exactly 7 ';'
|
# Remove lines that don't have exactly 7 ';'
|
||||||
rg "^([^;]*;){7}[^;]+$" $torrents_csv > tmp_adds
|
rg "^([^;]*;){7}[^;]+$" $torrents_csv_tmp > tmp_adds
|
||||||
mv tmp_adds $torrents_csv
|
mv tmp_adds $torrents_csv_tmp
|
||||||
|
|
||||||
# Remove random newlines
|
# Remove random newlines
|
||||||
sed -i '/^$/d' $torrents_csv
|
sed -i '/^$/d' $torrents_csv_tmp
|
||||||
|
|
||||||
# Extract the header
|
# Extract the header
|
||||||
header=$(head -n1 $torrents_csv)
|
header=$(head -n1 $torrents_csv_tmp)
|
||||||
sed -i '1d' $torrents_csv
|
sed -i '1d' $torrents_csv_tmp
|
||||||
|
|
||||||
# Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
|
# Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
|
||||||
# TODO this should actually probably do it by scraped date
|
# TODO this should actually probably do it by scraped date
|
||||||
# sort --field-separator=';' --key=5 -nr -o $torrents_csv $torrents_csv
|
# sort --field-separator=';' --key=5 -nr -o $torrents_csv_tmp $torrents_csv_tmp
|
||||||
|
|
||||||
|
# Remove dups, keeping the last ones
|
||||||
|
sort -r -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
|
||||||
|
sort -r -u -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
|
||||||
|
# sort -u -t';' -k2,2 -k8,8 -o $torrents_csv_tmp $torrents_csv_tmp
|
||||||
|
|
||||||
# Remove dups
|
|
||||||
sort -u -t';' -k1,1 -o $torrents_csv $torrents_csv
|
|
||||||
sort -u -t';' -k2,2 -k3,3 -o $torrents_csv $torrents_csv
|
|
||||||
|
|
||||||
# Same for the infohashes scanned
|
# Same for the infohashes scanned
|
||||||
sort -u -o $scanned_out $scanned_out
|
sort -u -o $scanned_out $scanned_out
|
||||||
|
|
||||||
# Remove torrents with zero seeders
|
# Remove torrents with zero seeders
|
||||||
awk -F';' '$5>=1' $torrents_csv> tmp
|
awk -F';' '$5>=1' $torrents_csv_tmp> tmp
|
||||||
mv tmp $torrents_csv
|
mv tmp $torrents_csv_tmp
|
||||||
|
|
||||||
# Sort by infohash asc
|
# Sort by infohash asc
|
||||||
sort --field-separator=';' --key=1 -o $torrents_csv $torrents_csv
|
sort --field-separator=';' --key=1 -o $torrents_csv_tmp $torrents_csv_tmp
|
||||||
|
|
||||||
# Add the header back in
|
# Add the header back in
|
||||||
sed -i "1i $header" $torrents_csv
|
sed -i "1i $header" $torrents_csv_tmp
|
||||||
#truncate -s -1 $torrents_csv # Removing last newline
|
#truncate -s -1 $torrents_csv # Removing last newline
|
||||||
|
|
||||||
|
mv $torrents_csv_tmp $torrents_csv
|
||||||
|
|
||||||
echo "Pruning done."
|
echo "Pruning done."
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
# This refetches the seeder counts for everything in torrents.csv, and updates the seeder counts
|
||||||
|
echo "Refetching seeder counts ..."
|
||||||
|
cd ..
|
||||||
|
torrents_csv="`pwd`/torrents.csv"
|
||||||
|
prune_currents_tmps="`pwd`/prune_currents_tmps"
|
||||||
|
mkdir $prune_currents_tmps
|
||||||
|
cd $prune_currents_tmps
|
||||||
|
|
||||||
|
cp $torrents_csv tmp
|
||||||
|
|
||||||
|
# Extract the header
|
||||||
|
header=$(head -n1 tmp)
|
||||||
|
sed -i '1d' tmp
|
||||||
|
|
||||||
|
cat tmp | cut -d ';' -f1 > tmp2
|
||||||
|
mv tmp2 tmp
|
||||||
|
|
||||||
|
mkdir prune_currents_tmps
|
||||||
|
# Split these up into batches of 2000 infohashes (one per line)
|
||||||
|
split -l 2000 tmp tmp_
|
||||||
|
|
||||||
|
> no_seeds
|
||||||
|
for f in tmp_*; do
|
||||||
|
echo "Fetching seeds..."
|
||||||
|
echo $f
|
||||||
|
torrent-tracker-health --torrent "$f" > health
|
||||||
|
|
||||||
|
# Select the infohashes with zero seeders
|
||||||
|
# append to a no seeds file
|
||||||
|
jq '.results[] | select(.seeders==0) | .hash' health | tr -d \" >> no_seeds
|
||||||
|
rm $f
|
||||||
|
done
|
||||||
|
|
||||||
|
# Remove those lines from the file
|
||||||
|
rg -vwF -f no_seeds $torrents_csv > torrents_removed.csv
|
||||||
|
|
||||||
|
rm tmp_*
|
||||||
|
rm health
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,11 +6,7 @@ cd ..
|
||||||
torrents_csv="`pwd`/torrents.csv"
|
torrents_csv="`pwd`/torrents.csv"
|
||||||
scanned_out="`pwd`/infohashes_scanned.txt"
|
scanned_out="`pwd`/infohashes_scanned.txt"
|
||||||
tmp_torrent_dir="`pwd`/tmp_torrents"
|
tmp_torrent_dir="`pwd`/tmp_torrents"
|
||||||
names_out="`pwd`/names.out"
|
|
||||||
health_out="`pwd`/health.out"
|
|
||||||
touch $scanned_out
|
touch $scanned_out
|
||||||
touch $names_out
|
|
||||||
touch $health_out
|
|
||||||
|
|
||||||
help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help"
|
help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help"
|
||||||
if [ "$1" == "-h" ] || [ -z "$1" ]; then
|
if [ "$1" == "-h" ] || [ -z "$1" ]; then
|
||||||
|
@ -32,47 +28,50 @@ fi
|
||||||
|
|
||||||
# Loop over all torrents
|
# Loop over all torrents
|
||||||
pushd $torrents_dir
|
pushd $torrents_dir
|
||||||
# for torrent_file in *.torrent; do
|
|
||||||
# Copy the unscanned torrent files to a temp dir
|
# Copy the unscanned torrent files to a temp dir
|
||||||
mkdir $tmp_torrent_dir
|
mkdir $tmp_torrent_dir
|
||||||
find `pwd` -name "*.torrent" | sort -n | grep -vFf $scanned_out | while read torrent_file ; do
|
find `pwd` -name "*.torrent" | sort -n | grep -vFf $scanned_out | while read torrent_file ; do
|
||||||
cp "$torrent_file" "$tmp_torrent_dir"
|
cp "$torrent_file" "$tmp_torrent_dir"
|
||||||
echo $(basename "$torrent_file" .torrent) >> $names_out
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Delete null torrents from the temp dir
|
# Split these into many directories (since torrent-tracker-health can't handle too many full-size torrents at once)
|
||||||
find $tmp_torrent_dir -name "*.torrent" -size -2k -delete
|
cd $tmp_torrent_dir
|
||||||
|
# i=1;while read l;do mkdir $i;mv $l $((i++));done< <(ls|xargs -n100)
|
||||||
|
ls|parallel -n100 mkdir {#}\;mv {} {#}
|
||||||
|
|
||||||
if [ -z "$(ls -A $tmp_torrent_dir)" ]; then
|
for tmp_torrent_dir_sub in *; do
|
||||||
echo "No new torrents."
|
echo "sub dir: $tmp_torrent_dir_sub"
|
||||||
else
|
find $tmp_torrent_dir_sub -type f -exec basename {} .torrent \; > names.out
|
||||||
# Scrape it
|
|
||||||
torrent-tracker-health --torrent "$tmp_torrent_dir" > $health_out
|
|
||||||
|
|
||||||
echo -e "$health_out"
|
# Delete null torrents from the temp dir
|
||||||
|
find $tmp_torrent_dir_sub -name "*.torrent" -size -2k -delete
|
||||||
|
|
||||||
# Convert the json results to csv format
|
if [ -z "$(ls -A $tmp_torrent_dir_sub)" ]; then
|
||||||
results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' $health_out)
|
echo "No new torrents."
|
||||||
# // "1970-01-01T00:00:00.000Z"
|
|
||||||
# If there are no results
|
|
||||||
if [ -z "$results" ]; then
|
|
||||||
echo "There were no results for some reason."
|
|
||||||
else
|
else
|
||||||
# Update the torrents.csv and infohashes scanned file
|
# Scrape it
|
||||||
echo -e "$results" >> $torrents_csv
|
torrent-tracker-health --torrent "$tmp_torrent_dir_sub"/ > health.out
|
||||||
cat "$names_out" >> $scanned_out
|
|
||||||
|
|
||||||
popd
|
# Convert the json results to csv format
|
||||||
cd scripts
|
results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' health.out)
|
||||||
. prune.sh
|
|
||||||
|
# If there are no results
|
||||||
|
if [ -z "$results" ]; then
|
||||||
|
echo "There were no results for some reason."
|
||||||
|
else
|
||||||
|
echo "Torrents.csv updated with new torrents."
|
||||||
|
# Update the torrents.csv and infohashes scanned file
|
||||||
|
echo -e "$results" >> $torrents_csv
|
||||||
|
cat names.out >> $scanned_out
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
popd
|
||||||
fi
|
cd scripts
|
||||||
|
. prune.sh
|
||||||
|
|
||||||
# Remove the temp dir
|
# Remove the temp dir
|
||||||
rm -rf "$tmp_torrent_dir"
|
rm -rf "$tmp_torrent_dir"
|
||||||
rm "$names_out"
|
|
||||||
rm "$health_out"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue