# torrents.csv/scripts/prune.sh
#
# (Code-viewer page metadata: 44 lines, 1.2 KiB, Bash.)

# Prunes torrents.csv and leaves it sorted.
#
# Run from the scripts/ directory: the data files are looked up in the parent
# of the current working directory.
#
# Steps:
#   1. keep only rows with exactly 7 ';' separators and a non-empty last column
#   2. drop empty lines
#   3. set the header row aside
#   4. de-duplicate on column 1, then on columns 2 and 3
#   5. de-duplicate infohashes_scanned.txt (if present)
#   6. drop rows whose column 5 (seeders) is below 1
#   7. sort by column 1 (infohash) and put the header row back on top
#
# Requires: rg (ripgrep), sed with -i support, sort, awk.
echo "Pruning torrents.csv ..."

cd .. || { echo "prune.sh: cannot cd to parent directory" >&2; exit 1; }

torrents_csv="$(pwd)/torrents.csv"
scanned_out="$(pwd)/infohashes_scanned.txt"

# Scratch file next to the data so the final mv stays on one filesystem;
# removed on every exit path.
tmp_file="${torrents_csv}.tmp.$$"
trap 'rm -f -- "$tmp_file"' EXIT

if [ ! -f "$torrents_csv" ]; then
  echo "prune.sh: $torrents_csv not found" >&2
  exit 1
fi

# Remove lines that don't have exactly 7 ';'
# rg exits 0 on a match and 1 on no match; anything higher (including 127 for
# "command not found") is a real failure, in which case the original file must
# not be replaced by the empty/partial scratch file.
rg '^([^;]*;){7}[^;]+$' "$torrents_csv" > "$tmp_file"
rg_status=$?
if [ "$rg_status" -gt 1 ]; then
  echo "prune.sh: rg failed (status $rg_status); torrents.csv left untouched" >&2
  exit 1
fi
mv -- "$tmp_file" "$torrents_csv" || exit 1

# Remove random newlines
sed -i '/^$/d' "$torrents_csv"

# Extract the header so it does not take part in sorting and filtering
header=$(head -n1 "$torrents_csv")
sed -i '1d' "$torrents_csv"

# Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
# TODO this should actually probably do it by scraped date
# sort --field-separator=';' --key=5 -nr -o "$torrents_csv" "$torrents_csv"

# Remove dups: first on column 1, then on the pair of columns 2 and 3
sort -u -t';' -k1,1 -o "$torrents_csv" "$torrents_csv"
sort -u -t';' -k2,2 -k3,3 -o "$torrents_csv" "$torrents_csv"

# Same for the infohashes scanned (skipped when the file does not exist)
if [ -f "$scanned_out" ]; then
  sort -u -o "$scanned_out" "$scanned_out"
fi

# Remove torrents with zero seeders
awk -F';' '$5>=1' "$torrents_csv" > "$tmp_file" \
  && mv -- "$tmp_file" "$torrents_csv"

# Sort by infohash asc
sort --field-separator=';' --key=1 -o "$torrents_csv" "$torrents_csv"

# Add the header back in.
# Prepending via printf/cat (instead of sed "1i ...") also works when no data
# rows survived, and writes the header text verbatim.
if [ -n "$header" ]; then
  { printf '%s\n' "$header"; cat -- "$torrents_csv"; } > "$tmp_file" \
    && mv -- "$tmp_file" "$torrents_csv"
fi

#truncate -s -1 "$torrents_csv" # Removing last newline
echo "Pruning done."