From 70223ebbef10f8fae1a64efb230f43eb000006c6 Mon Sep 17 00:00:00 2001
From: Dessalines <tyhou13@gmx.com>
Date: Mon, 28 Jan 2019 15:01:03 -0800
Subject: [PATCH] Bunch of fixes to scanning and pruning

---
 scripts/prune.sh          | 34 ++++++++++++----------
 scripts/prune_currents.sh | 52 +++++++++++++++++++++++++++++++++
 scripts/scan_torrents.sh  | 61 +++++++++++++++++++--------------------
 3 files changed, 101 insertions(+), 46 deletions(-)
 create mode 100755 scripts/prune_currents.sh

diff --git a/scripts/prune.sh b/scripts/prune.sh
index 64e5c40..e4f1b26 100755
--- a/scripts/prune.sh
+++ b/scripts/prune.sh
@@ -2,42 +2,46 @@
 echo "Pruning torrents.csv ..."
 cd ..
 torrents_csv="`pwd`/torrents.csv"
+torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv"
 scanned_out="`pwd`/infohashes_scanned.txt"
 
+cp $torrents_csv $torrents_csv_tmp
 
 # Remove lines that don't have exactly 7 ';'
-rg "^([^;]*;){7}[^;]+$" $torrents_csv > tmp_adds
-mv tmp_adds $torrents_csv
+rg "^([^;]*;){7}[^;]+$" $torrents_csv_tmp > tmp_adds
+mv tmp_adds $torrents_csv_tmp
 
 # Remove random newlines
-sed -i '/^$/d' $torrents_csv 
+sed -i '/^$/d' $torrents_csv_tmp 
 
 # Extract the header
-header=$(head -n1 $torrents_csv) 
-sed -i '1d' $torrents_csv
+header=$(head -n1 $torrents_csv_tmp)
+sed -i '1d' $torrents_csv_tmp
 
 # Sort by seeders desc (so when we remove dups it removes the lower seeder counts)
 # TODO this should actually probably do it by scraped date
-# sort --field-separator=';' --key=5 -nr -o $torrents_csv $torrents_csv
+# sort --field-separator=';' --key=5 -nr -o $torrents_csv_tmp $torrents_csv_tmp
+
+# Remove duplicates (by infohash, field 1), keeping the last ones
+sort -r -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
+sort -r -u -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp
+# sort -u -t';' -k2,2 -k8,8 -o $torrents_csv_tmp $torrents_csv_tmp
 
-# Remove dups
-sort -u -t';' -k1,1 -o $torrents_csv $torrents_csv
-sort -u -t';' -k2,2 -k3,3 -o $torrents_csv $torrents_csv
 
 # Same for the infohashes scanned
 sort -u -o $scanned_out $scanned_out
 
 # Remove torrents with zero seeders
-awk -F';' '$5>=1' $torrents_csv> tmp
-mv tmp $torrents_csv
+awk -F';' '$5>=1' $torrents_csv_tmp> tmp
+mv tmp $torrents_csv_tmp
 
 # Sort by infohash asc
-sort --field-separator=';' --key=1 -o $torrents_csv $torrents_csv
+sort --field-separator=';' --key=1 -o $torrents_csv_tmp $torrents_csv_tmp
 
 # Add the header back in
-sed  -i "1i $header" $torrents_csv
+sed  -i "1i $header" $torrents_csv_tmp
 #truncate -s -1 $torrents_csv # Removing last newline
 
-echo "Pruning done."
-
+mv $torrents_csv_tmp $torrents_csv
 
+echo "Pruning done."
\ No newline at end of file
diff --git a/scripts/prune_currents.sh b/scripts/prune_currents.sh
new file mode 100755
index 0000000..f678f69
--- /dev/null
+++ b/scripts/prune_currents.sh
@@ -0,0 +1,52 @@
+# This refetches the seeder counts for everything in torrents.csv, and updates the seeder counts
+echo "Refetching seeder counts ..."
+cd ..
+torrents_csv="`pwd`/torrents.csv"
+prune_currents_tmps="`pwd`/prune_currents_tmps"
+mkdir $prune_currents_tmps
+cd $prune_currents_tmps
+
+cp $torrents_csv tmp
+
+# Extract the header
+header=$(head -n1 tmp) 
+sed -i '1d' tmp
+
+cat tmp | cut -d ';' -f1 > tmp2
+mv tmp2 tmp
+
+mkdir prune_currents_tmps
+# Split these up into batches of 2000 infohashes (one per line)
+split -l 2000 tmp tmp_
+
+> no_seeds
+for f in tmp_*; do
+  echo "Fetching seeds..."
+  echo $f
+  torrent-tracker-health --torrent "$f" > health
+
+  # Select the infohashes with zero seeders
+  # and append them to the no_seeds file
+  jq '.results[] | select(.seeders==0) | .hash' health | tr -d \" >> no_seeds
+  rm $f
+done
+
+# Write torrents.csv, minus the lines matching those infohashes, to torrents_removed.csv
+rg -vwF -f no_seeds $torrents_csv > torrents_removed.csv
+
+rm tmp_*
+rm health
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/scripts/scan_torrents.sh b/scripts/scan_torrents.sh
index 2eaef8a..edae0d7 100755
--- a/scripts/scan_torrents.sh
+++ b/scripts/scan_torrents.sh
@@ -6,11 +6,7 @@ cd ..
 torrents_csv="`pwd`/torrents.csv"
 scanned_out="`pwd`/infohashes_scanned.txt"
 tmp_torrent_dir="`pwd`/tmp_torrents"
-names_out="`pwd`/names.out"
-health_out="`pwd`/health.out"
 touch $scanned_out
-touch $names_out
-touch $health_out
 
 help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help"
 if [ "$1" == "-h" ] || [ -z "$1" ]; then
@@ -32,47 +28,50 @@ fi
 
 # Loop over all torrents
 pushd $torrents_dir
-# for torrent_file in *.torrent; do
 # Copy the unscanned torrent files to a temp dir
 mkdir $tmp_torrent_dir
 find `pwd` -name "*.torrent" | sort -n | grep -vFf $scanned_out | while read torrent_file ; do
   cp "$torrent_file" "$tmp_torrent_dir"
-  echo $(basename "$torrent_file" .torrent) >> $names_out
 done
 
-# Delete null torrents from the temp dir
-find $tmp_torrent_dir -name "*.torrent" -size -2k -delete
+# Split these into many directories of up to 100 entries each (since torrent-tracker-health can't handle too many full-size torrents at once)
+cd $tmp_torrent_dir
+# i=1;while read l;do mkdir $i;mv $l $((i++));done< <(ls|xargs -n100)
+ls|parallel -n100 mkdir {#}\;mv {} {#}
 
-if [ -z "$(ls -A $tmp_torrent_dir)" ]; then
-  echo "No new torrents."
-else
-  # Scrape it
-  torrent-tracker-health --torrent "$tmp_torrent_dir" > $health_out
+for tmp_torrent_dir_sub in *; do
+  echo "sub dir: $tmp_torrent_dir_sub"
+  find $tmp_torrent_dir_sub -type f  -exec basename {} .torrent \; > names.out
+  
+  # Delete null torrents from the temp dir
+  find $tmp_torrent_dir_sub -name "*.torrent" -size -2k -delete
 
-  echo -e "$health_out"
-
-  # Convert the json results to csv format
-  results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' $health_out)
-# // "1970-01-01T00:00:00.000Z"
-  # If there are no results
-  if [ -z "$results" ]; then
-    echo "There were no results for some reason."
+  if [ -z "$(ls -A $tmp_torrent_dir_sub)" ]; then
+    echo "No new torrents."
   else
-    # Update the torrents.csv and infohashes scanned file
-    echo -e "$results" >> $torrents_csv
-    cat "$names_out" >> $scanned_out
+    # Scrape it
+    torrent-tracker-health --torrent "$tmp_torrent_dir_sub"/ > health.out
 
-    popd
-    cd scripts
-    . prune.sh
+    # Convert the json results to csv format
+    results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' health.out)
+
+    # If there are no results
+    if [ -z "$results" ]; then
+      echo "There were no results for some reason."
+    else
+      echo "Torrents.csv updated with new torrents."
+      # Update the torrents.csv and infohashes scanned file
+      echo -e "$results" >> $torrents_csv
+      cat names.out >> $scanned_out
+    fi
   fi
+done
 
-
-fi
+popd
+cd scripts
+. prune.sh
 
 # Remove the temp dir
 rm -rf "$tmp_torrent_dir"
-rm "$names_out"
-rm "$health_out"