2019-11-06 18:14:50 -07:00
|
|
|
#!/bin/bash
|
|
|
|
|
2018-10-07 22:43:12 -07:00
|
|
|
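# Prunes torrents.csv: drops rows that do not have exactly 7 ';' separators,
# duplicate infohashes and torrents with zero seeders, then sorts it by infohash.
# Also de-duplicates infohashes_scanned.txt and torrent_files.csv.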
|
2018-11-25 16:53:55 -07:00
|
|
|
# Progress banner for the torrents.csv pass
printf '%s\n' 'Pruning torrents.csv ...'
|
2019-02-06 10:02:13 -08:00
|
|
|
# Work from the parent directory. Every path below is derived from it, so stop
# right away if the directory change fails instead of pruning files elsewhere.
pushd .. || exit 1
|
2018-10-07 22:43:12 -07:00
|
|
|
# Absolute path of the main torrent list, anchored at the current directory
torrents_csv="$(pwd)/torrents.csv"
|
2019-01-28 15:01:03 -08:00
|
|
|
# Scratch copy of torrents.csv that all pruning steps operate on
torrents_csv_tmp="$(pwd)/torrents_prune_tmp.csv"
|
2019-01-24 14:45:18 -08:00
|
|
|
# List of infohashes that have already been scanned
scanned_out="$(pwd)/infohashes_scanned.txt"
|
|
|
|
|
2019-06-25 14:28:44 -07:00
|
|
|
# Absolute path of the per-torrent file listing
torrent_files_csv="$(pwd)/torrent_files.csv"
|
|
|
|
# Scratch copy of torrent_files.csv used while pruning
torrent_files_csv_tmp="$(pwd)/torrent_files_tmp.csv"
|
|
|
|
|
2019-01-28 15:01:03 -08:00
|
|
|
# Work on a scratch copy. Paths are quoted so a directory name containing
# spaces or glob characters is passed as one argument. Abort if the copy fails:
# the scratch file replaces torrents.csv at the end, so continuing with a
# missing or partial copy would overwrite the real data.
cp "$torrents_csv" "$torrents_csv_tmp" || exit 1
|
2018-10-07 22:43:12 -07:00
|
|
|
|
|
|
|
# Remove lines that don't have exactly 7 ';'
|
2019-01-28 15:01:03 -08:00
|
|
|
# Keep only lines with 7 ';' separators followed by a non-empty last field.
# The pattern is single-quoted so the shell never touches the '$' anchor, and
# the path is quoted against word splitting. ripgrep exits 1 when nothing
# matches; any other non-zero status (rg missing, unreadable file) aborts here
# so an empty result is not later moved over torrents.csv.
rg '^([^;]*;){7}[^;]+$' "$torrents_csv_tmp" > tmp_adds || [ "$?" -eq 1 ] || exit 1
|
|
|
|
# Replace the scratch file with the filtered rows (path quoted so that
# whitespace in the directory name cannot split it into several arguments)
mv tmp_adds "$torrents_csv_tmp"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
|
|
|
# Remove random newlines
|
2019-02-06 10:02:13 -08:00
|
|
|
# Delete empty lines in place (path quoted against word splitting/globbing)
sed -i '/^$/d' "$torrents_csv_tmp"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
|
|
|
# Extract the header
|
2019-01-28 15:01:03 -08:00
|
|
|
# Remember the first line so it can be re-attached after sorting
# (path quoted against word splitting/globbing)
header=$(head -n1 "$torrents_csv_tmp")
|
|
|
|
# Drop the header line so it does not take part in de-duplication and sorting
# (path quoted against word splitting/globbing)
sed -i '1d' "$torrents_csv_tmp"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
2018-12-02 11:39:57 -07:00
|
|
|
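# Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
# NOTE(review): no sort command follows this comment; the de-dup step below
# works on the existing row order (reversed with tac) instead. Confirm intent.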
|
2019-01-28 15:01:03 -08:00
|
|
|
|
|
|
|
# Remove dups, keeping the last ones
|
2019-02-20 15:00:24 -08:00
|
|
|
# Reverse the file so later rows come first, then keep one row per value of
# field 1. Writing back with -o to the file tac is reading relies on sort
# consuming all of its input before it writes the output file. Paths are
# quoted against word splitting/globbing.
tac "$torrents_csv_tmp" | sort -u -t';' -k1,1 -o "$torrents_csv_tmp"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
2019-01-24 14:45:18 -08:00
|
|
|
# Same for the infohashes scanned
|
|
|
|
# Sort the scanned-infohash list in place and drop duplicate lines
# (path quoted against word splitting/globbing)
sort -u -o "$scanned_out" "$scanned_out"
|
|
|
|
|
|
|
|
# Remove torrents with zero seeders
|
2019-01-28 15:01:03 -08:00
|
|
|
# Keep rows whose 5th ';'-separated field is at least 1. The path is quoted
# against word splitting, and a failing awk aborts the run so that an empty
# or partial "tmp" is not moved into place by the next step.
awk -F';' '$5>=1' "$torrents_csv_tmp" > tmp || exit 1
|
|
|
|
# Replace the scratch file with the rows that still have seeders
# (path quoted against word splitting/globbing)
mv tmp "$torrents_csv_tmp"
|
2019-01-24 14:45:18 -08:00
|
|
|
|
2018-12-02 11:39:57 -07:00
|
|
|
# Sort by infohash asc
|
2019-01-28 15:01:03 -08:00
|
|
|
# In-place ascending sort; the key starts at field 1 and runs to end of line
# (paths quoted against word splitting/globbing)
sort --field-separator=';' --key=1 -o "$torrents_csv_tmp" "$torrents_csv_tmp"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
|
|
|
# Add the header back in
|
2019-01-28 15:01:03 -08:00
|
|
|
#######################################
# Put a header line back on top of a file.
# Written with printf/cat instead of sed's "1i": sed has no line 1 to insert
# before when the file is empty (so the header would be lost once every data
# row has been pruned), and sed reinterprets backslashes and leading blanks in
# the inserted text.
# Arguments: $1 - header line, $2 - path of the file to update
# Returns:   0 on success or when $1 is empty (nothing to restore),
#            non-zero if the file could not be rewritten
#######################################
prepend_header() {
  local header_line=$1
  local target=$2
  [ -n "$header_line" ] || return 0
  # Build the new file next to the target so the final mv stays on one filesystem
  { printf '%s\n' "$header_line"; cat "$target"; } > "${target}.hdr" \
    && mv "${target}.hdr" "$target"
}

prepend_header "$header" "$torrents_csv_tmp"
|
2018-10-15 14:02:47 -07:00
|
|
|
#truncate -s -1 $torrents_csv # Removing last newline
|
2018-10-07 22:43:12 -07:00
|
|
|
|
2019-01-28 15:01:03 -08:00
|
|
|
# Move the pruned scratch file over the real torrents.csv
# (paths quoted against word splitting/globbing)
mv "$torrents_csv_tmp" "$torrents_csv"
|
2018-10-07 22:43:12 -07:00
|
|
|
|
2019-06-25 14:28:44 -07:00
|
|
|
# Torrent files cleanup
|
|
|
|
# Progress banner for the torrent_files.csv pass
printf '%s\n' 'Pruning torrent_files.csv ...'
|
|
|
|
# Work on a scratch copy. Paths are quoted against word splitting. Abort if
# the copy fails, because the scratch file replaces torrent_files.csv at the
# end and a missing or partial copy would overwrite the real data.
cp "$torrent_files_csv" "$torrent_files_csv_tmp" || exit 1
|
|
|
|
|
|
|
|
# Header
|
|
|
|
# Remember the first line so it can be re-attached after sorting
# (path quoted against word splitting/globbing)
header=$(head -n1 "$torrent_files_csv_tmp")
|
|
|
|
# Drop the header line so it does not take part in de-duplication and sorting
# (path quoted against word splitting/globbing)
sed -i '1d' "$torrent_files_csv_tmp"
|
|
|
|
|
2019-11-06 18:14:50 -07:00
|
|
|
# Remove dups, keeping the last ones
|
|
|
|
# Reverse the file so later rows come first, then keep one row per distinct
# (field 1, field 2) pair. Writing back with -o to the file tac is reading
# relies on sort consuming all of its input before it writes the output file.
# Paths are quoted against word splitting/globbing.
tac "$torrent_files_csv_tmp" | sort -u -t';' -k1,1 -k2,2 -o "$torrent_files_csv_tmp"
|
|
|
|
|
2019-06-25 14:28:44 -07:00
|
|
|
# Sort ascending by the first two ';'-separated fields
|
|
|
|
# In-place ascending sort on the key spanning fields 1 through 2
# (paths quoted against word splitting/globbing)
sort --field-separator=';' --key=1,2 -o "$torrent_files_csv_tmp" "$torrent_files_csv_tmp"
|
|
|
|
|
|
|
|
# Add the header back in
|
|
|
|
# Re-attach the header with printf/cat instead of sed's "1i": sed has no line 1
# to insert before when the file is empty (the header would be lost), and it
# reinterprets backslashes and leading blanks in the inserted text. Nothing is
# done when no header was captured.
if [ -n "$header" ]; then
  { printf '%s\n' "$header"; cat "$torrent_files_csv_tmp"; } > "${torrent_files_csv_tmp}.hdr" \
    && mv "${torrent_files_csv_tmp}.hdr" "$torrent_files_csv_tmp"
fi
|
|
|
|
|
|
|
|
# Move the pruned scratch file over the real torrent_files.csv
# (paths quoted against word splitting/globbing)
mv "$torrent_files_csv_tmp" "$torrent_files_csv"
|
|
|
|
|
2019-02-06 10:02:13 -08:00
|
|
|
# Return to the directory that was current before the pushd near the top
popd
|
|
|
|
|
2019-01-28 16:49:29 -08:00
|
|
|
# Final status line
printf '%s\n' 'Pruning done.'
|