torrents.csv/scripts/prune.sh

69 lines
1.8 KiB
Bash
Raw Permalink Normal View History

2019-11-07 01:14:50 +00:00
#!/bin/bash
#
# Prunes the data files that live in the parent of the current working
# directory (run this script from the scripts/ directory):
#
#   torrents.csv           - drops malformed rows, duplicate infohashes (the
#                            occurrence closest to the end of the file wins)
#                            and rows with fewer than 1 seeder, then sorts
#                            the rows.
#   infohashes_scanned.txt - sorted, duplicates removed (skipped if absent).
#   torrent_files.csv      - drops duplicate (infohash, index) pairs (last
#                            occurrence wins), then sorts the rows.
#
# Each CSV is rebuilt in a scratch file next to it and only moved over the
# original once every step has succeeded, so a failing tool can never leave
# a truncated data file behind.
#
# Relies on GNU grep, tac, sort and awk.
# Exit status: 0 when every file was pruned, 1 otherwise.

# No `set -e` on purpose: every step is checked explicitly so that a problem
# with one file is reported without preventing the other files from being
# pruned. `pipefail` makes a pipeline fail when any of its stages fails.
set -uo pipefail

# A well-formed torrents.csv row has exactly 7 ';' and a non-empty last field.
readonly TORRENTS_ROW_RE='^([^;]*;){7}[^;]+$'

# Prints a diagnostic on stderr.
warn() {
  printf 'prune.sh: %s\n' "$*" >&2
}

#######################################
# Rewrites a torrents CSV in place: well-formed rows only, one row per
# infohash, seeders >= 1, sorted, header kept on top.
# The header is the first well-formed line, so it must itself have 8 fields.
# Arguments: $1 - path to torrents.csv
# Returns:   0 on success; 1 (file left untouched) on any failure
#######################################
prune_torrents_csv() {
  local csv=$1
  local work="${csv%.csv}_prune_tmp.csv"
  local header

  if [[ ! -f "$csv" ]]; then
    warn "$csv not found"
    return 1
  fi

  # -a: never let grep treat the data as binary and print a summary instead
  # of the matching lines.
  if ! header=$(grep -aE -m1 -- "$TORRENTS_ROW_RE" "$csv"); then
    warn "no well-formed row found in $csv; leaving it unchanged"
    return 1
  fi

  # Blank lines never match the row pattern, so no separate step is needed
  # to remove them. The header is written with printf (not `sed 1i`) so it
  # survives even when no data row is left, and is never reinterpreted.
  if ! {
    printf '%s\n' "$header" &&
      grep -aE -- "$TORRENTS_ROW_RE" "$csv" |
      tail -n +2 |
      tac |
      sort -s -u -t';' -k1,1 |
      awk -F';' '$5>=1' |
      sort --field-separator=';' --key=1
  } > "$work"; then
    # tac + stable unique sort above keep, for each infohash, the row that
    # was last in the file; any stage failing lands here.
    warn "failed to prune $csv; leaving it unchanged"
    rm -f -- "$work"
    return 1
  fi

  if ! mv -- "$work" "$csv"; then
    warn "cannot replace $csv"
    rm -f -- "$work"
    return 1
  fi
}

#######################################
# Sorts the scanned-infohash list in place and removes duplicate lines.
# Arguments: $1 - path to infohashes_scanned.txt
# Returns:   0 on success or when the file does not exist; non-zero otherwise
#######################################
dedup_scanned() {
  local list=$1

  [[ -f "$list" ]] || return 0
  # sort reads all of its input before opening the -o file, so sorting a
  # file onto itself is safe.
  sort -u -o "$list" -- "$list"
}

#######################################
# Rewrites a torrent-files CSV in place: one row per (infohash, index) pair,
# sorted, header (the first line) kept on top.
# Arguments: $1 - path to torrent_files.csv
# Returns:   0 on success; 1 (file left untouched) on any failure
#######################################
prune_torrent_files_csv() {
  local csv=$1
  local work="${csv%.csv}_tmp.csv"
  local header

  if [[ ! -f "$csv" ]]; then
    warn "$csv not found"
    return 1
  fi
  # Nothing to prune in an empty file.
  [[ -s "$csv" ]] || return 0

  if ! header=$(head -n1 -- "$csv"); then
    warn "cannot read $csv"
    return 1
  fi

  # `awk 'NR > 1'` skips the header and also terminates a final line that
  # lacks a newline; without that, tac would glue the last two rows together.
  if ! {
    printf '%s\n' "$header" &&
      awk 'NR > 1' "$csv" |
      tac |
      sort -s -u -t';' -k1,1 -k2,2 |
      sort --field-separator=';' --key=1,2
  } > "$work"; then
    warn "failed to prune $csv; leaving it unchanged"
    rm -f -- "$work"
    return 1
  fi

  if ! mv -- "$work" "$csv"; then
    warn "cannot replace $csv"
    rm -f -- "$work"
    return 1
  fi
}

#######################################
# Entry point: prunes the three data files found in the parent directory of
# the current working directory.
# Returns: 0 if everything was pruned, 1 if any step failed
#######################################
main() {
  local rc=0
  local data_dir

  echo "Pruning torrents.csv ..."
  if ! pushd .. > /dev/null; then
    warn "cannot enter the parent directory"
    return 1
  fi
  data_dir=$(pwd)

  prune_torrents_csv "${data_dir}/torrents.csv" || rc=1
  dedup_scanned "${data_dir}/infohashes_scanned.txt" || rc=1

  echo "Pruning torrent_files.csv ..."
  prune_torrent_files_csv "${data_dir}/torrent_files.csv" || rc=1

  popd > /dev/null || rc=1

  if (( rc == 0 )); then
    echo "Pruning done."
  else
    warn "pruning finished with errors"
  fi
  return "$rc"
}

main