# This prunes torrents.csv and sorts it. Rows are dropped when they
#   * do not have exactly 7 ';' separators followed by a non-empty last column,
#   * repeat an infohash (column 1) that is already present, or
#   * have fewer than 1 seeder (column 5).
# The first well-formed line is treated as the header and is written back as
# the first line of the result. infohashes_scanned.txt is sorted and
# de-duplicated as well.
#
# The script operates on the parent of the current directory. It deliberately
# avoids `set -e` and `exit` and keeps the `cd ..` and the path variables at
# top level, so it behaves the same whether it is executed or sourced.
#
# Requires: ripgrep (rg), sort, awk, head, tail, mktemp.

# Rebuilds the pruned CSV and moves it over the original.
# Arguments: $1 - CSV to prune (replaced only if every step succeeds)
#            $2 - work file; becomes the new CSV
#            $3 - scratch file for intermediate rows
# Returns:   0 on success, 1 if any step failed (CSV left untouched).
_prune_rebuild_csv() {
  local csv="$1" work="$2" rows="$3"
  local header rc=0

  # Keep only lines with exactly 7 ';' and a non-empty last column. Such lines
  # can never be empty, so no separate blank-line removal is needed.
  rg '^([^;]*;){7}[^;]+$' "$csv" > "$work" || rc=$?
  # Exit status 1 only means "no line matched"; anything higher is a failure.
  if [ "$rc" -gt 1 ]; then
    echo "prune: filtering $csv with rg failed (exit $rc)" >&2
    return 1
  fi

  # Split the header from the data rows.
  header=$(head -n 1 "$work") || return 1
  tail -n +2 "$work" > "$rows" || return 1

  # TODO this should actually probably pick the survivor by scraped date.
  # Remove dups: order by infohash (descending), then keep one row per
  # infohash.
  sort -r -t';' -k1,1 -o "$rows" "$rows" || return 1
  sort -r -u -t';' -k1,1 -o "$rows" "$rows" || return 1

  # Remove torrents with zero seeders, then sort by infohash asc.
  awk -F';' '$5>=1' < "$rows" > "$work" || return 1
  sort --field-separator=';' --key=1 -o "$rows" "$work" || return 1

  # Add the header back in. The result is assembled in the work file (created
  # by redirection, so with the default file mode) rather than inserted with
  # sed, which would skip the header when no rows are left.
  {
    { [ -z "$header" ] || printf '%s\n' "$header"; } && cat -- "$rows"
  } > "$work" || return 1

  mv -- "$work" "$csv"
}

# Prunes a torrents CSV and tidies the scanned-infohash list.
# Arguments: $1 - path of the CSV to prune in place
#            $2 - path of the temporary work file (same directory as $1)
#            $3 - path of the scanned-infohash list (skipped if missing)
# Outputs:   diagnostics on stderr
# Returns:   0 on success, 1 on any failure.
prune_torrents_csv() {
  local csv="$1" work="$2" scanned="$3"
  local rows status=0

  if [ ! -f "$csv" ]; then
    echo "prune: $csv not found" >&2
    return 1
  fi

  # Unique scratch file next to the work file instead of a fixed name in the
  # current directory.
  rows=$(mktemp "${work}.XXXXXX") || return 1

  _prune_rebuild_csv "$csv" "$work" "$rows" || status=1
  # On success the work file has already been moved; -f keeps this quiet.
  rm -f -- "$rows" "$work"

  # Same for the infohashes scanned.
  if [ -f "$scanned" ]; then
    sort -u -o "$scanned" "$scanned" || status=1
  fi

  return "$status"
}

echo "Pruning torrents.csv ..."
cd .. &&
  torrents_csv="$PWD/torrents.csv" &&
  torrents_csv_tmp="$PWD/torrents_prune_tmp.csv" &&
  scanned_out="$PWD/infohashes_scanned.txt" &&
  prune_torrents_csv "$torrents_csv" "$torrents_csv_tmp" "$scanned_out" &&
  echo "Pruning done."