Improving pruning.

This commit is contained in:
Dessalines 2019-01-28 16:49:29 -08:00
parent f5b1282a47
commit 48dc083784
4 changed files with 77 additions and 7 deletions

View File

@ -19,13 +19,10 @@ header=$(head -n1 $torrents_csv_tmp)
sed -i '1d' $torrents_csv_tmp sed -i '1d' $torrents_csv_tmp
# Sort by seeders desc (so when we remove dups it removes the lower seeder counts) # Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
# TODO this should actually probably do it by scraped date
# sort --field-separator=';' --key=5 -nr -o $torrents_csv_tmp $torrents_csv_tmp
# Remove dups, keeping the last ones # Remove dups, keeping the last ones
sort -r -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp sort -r -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
sort -r -u -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp sort -r -u -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
# sort -u -t';' -k2,2 -k8,8 -o $torrents_csv_tmp $torrents_csv_tmp
# Same for the infohashes scanned # Same for the infohashes scanned

View File

@ -0,0 +1,59 @@
Refetching seeder counts ...
Fetching seeds...
tmp_aa
Fetching seeds...
tmp_ab
Fetching seeds...
tmp_ac
Fetching seeds...
tmp_ad
Fetching seeds...
tmp_ae
Fetching seeds...
tmp_af
Fetching seeds...
tmp_ag
Fetching seeds...
tmp_ah
Fetching seeds...
tmp_ai
Fetching seeds...
tmp_aj
Fetching seeds...
tmp_ak
Fetching seeds...
tmp_al
Fetching seeds...
tmp_am
Fetching seeds...
tmp_an
Fetching seeds...
tmp_ao
Fetching seeds...
tmp_ap
Fetching seeds...
tmp_aq
Fetching seeds...
tmp_ar
Fetching seeds...
tmp_as
Fetching seeds...
tmp_at
Fetching seeds...
tmp_au
Fetching seeds...
tmp_av
Fetching seeds...
tmp_aw
Fetching seeds...
tmp_ax
Fetching seeds...
tmp_ay
Fetching seeds...
tmp_az
Fetching seeds...
tmp_ba
Fetching seeds...
tmp_bb
Fetching seeds...
tmp_bc

View File

@ -2,6 +2,7 @@
echo "Refetching seeder counts ..." echo "Refetching seeder counts ..."
cd .. cd ..
torrents_csv="`pwd`/torrents.csv" torrents_csv="`pwd`/torrents.csv"
torrents_removed="`pwd`/torrents_removed.csv"
prune_currents_tmps="`pwd`/prune_currents_tmps" prune_currents_tmps="`pwd`/prune_currents_tmps"
mkdir $prune_currents_tmps mkdir $prune_currents_tmps
cd $prune_currents_tmps cd $prune_currents_tmps
@ -15,7 +16,6 @@ sed -i '1d' tmp
cat tmp | cut -d ';' -f1 > tmp2 cat tmp | cut -d ';' -f1 > tmp2
mv tmp2 tmp mv tmp2 tmp
mkdir prune_currents_tmps
# Split these up into 2000 file batches # Split these up into 2000 file batches
split -l 2000 tmp tmp_ split -l 2000 tmp tmp_
@ -32,9 +32,10 @@ for f in tmp_*; do
done done
# Remove those lines from the file # Remove those lines from the file
rg -vwF -f no_seeds $torrents_csv > torrents_removed.csv rg -vwF -f no_seeds $torrents_csv > $torrents_removed
rm tmp_* cd ..
rm $prune_currents_tmps
rm health rm health

13
scripts/scan_torrents.out Normal file
View File

@ -0,0 +1,13 @@
Torrents dir=/home/tyler/Tyhous_HD2/git/torrents.csv/torrents
torrent-tracker-health installed.
~/Tyhous_HD2/git/torrents.csv/torrents ~/Tyhous_HD2/git/torrents.csv
sub dir: 1
Torrents.csv updated with new torrents.
sub dir: 2
jq: error (at health.out:12295): date "+047876-11-23T23" does not match format "%Y-%m-%dT%H:%M"
There were no results for some reason.
sub dir: 3
Torrents.csv updated with new torrents.
~/Tyhous_HD2/git/torrents.csv
Pruning torrents.csv ...
Pruning done.