From 70223ebbef10f8fae1a64efb230f43eb000006c6 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Mon, 28 Jan 2019 15:01:03 -0800 Subject: [PATCH] Bunch of fixes to scanning and pruning --- scripts/prune.sh | 34 ++++++++++++---------- scripts/prune_currents.sh | 52 +++++++++++++++++++++++++++++++++ scripts/scan_torrents.sh | 61 +++++++++++++++++++-------------------- 3 files changed, 101 insertions(+), 46 deletions(-) create mode 100755 scripts/prune_currents.sh diff --git a/scripts/prune.sh b/scripts/prune.sh index 64e5c40..e4f1b26 100755 --- a/scripts/prune.sh +++ b/scripts/prune.sh @@ -2,42 +2,46 @@ echo "Pruning torrents.csv ..." cd .. torrents_csv="`pwd`/torrents.csv" +torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv" scanned_out="`pwd`/infohashes_scanned.txt" +cp $torrents_csv $torrents_csv_tmp # Remove lines that don't have exactly 7 ';' -rg "^([^;]*;){7}[^;]+$" $torrents_csv > tmp_adds -mv tmp_adds $torrents_csv +rg "^([^;]*;){7}[^;]+$" $torrents_csv_tmp > tmp_adds +mv tmp_adds $torrents_csv_tmp # Remove random newlines -sed -i '/^$/d' $torrents_csv +sed -i '/^$/d' $torrents_csv_tmp # Extract the header -header=$(head -n1 $torrents_csv) -sed -i '1d' $torrents_csv +header=$(head -n1 $torrents_csv_tmp) +sed -i '1d' $torrents_csv_tmp # Sort by seeders desc (so when we remove dups it removes the lower seeder counts) # TODO this should actually probably do it by scraped date -# sort --field-separator=';' --key=5 -nr -o $torrents_csv $torrents_csv +# sort --field-separator=';' --key=5 -nr -o $torrents_csv_tmp $torrents_csv_tmp + +# Remove dups, keeping the last ones +sort -r -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp +sort -r -u -t';' -k1,1 -o $torrents_csv_tmp $torrents_csv_tmp +# sort -u -t';' -k2,2 -k8,8 -o $torrents_csv_tmp $torrents_csv_tmp -# Remove dups -sort -u -t';' -k1,1 -o $torrents_csv $torrents_csv -sort -u -t';' -k2,2 -k3,3 -o $torrents_csv $torrents_csv # Same for the infohashes scanned sort -u -o $scanned_out $scanned_out # Remove torrents with 
zero seeders -awk -F';' '$5>=1' $torrents_csv> tmp -mv tmp $torrents_csv +awk -F';' '$5>=1' $torrents_csv_tmp> tmp +mv tmp $torrents_csv_tmp # Sort by infohash asc -sort --field-separator=';' --key=1 -o $torrents_csv $torrents_csv +sort --field-separator=';' --key=1 -o $torrents_csv_tmp $torrents_csv_tmp # Add the header back in -sed -i "1i $header" $torrents_csv +sed -i "1i $header" $torrents_csv_tmp #truncate -s -1 $torrents_csv # Removing last newline -echo "Pruning done." - +mv $torrents_csv_tmp $torrents_csv +echo "Pruning done." \ No newline at end of file diff --git a/scripts/prune_currents.sh b/scripts/prune_currents.sh new file mode 100755 index 0000000..f678f69 --- /dev/null +++ b/scripts/prune_currents.sh @@ -0,0 +1,52 @@ +# This refetches the seeder counts for everything in torrents.csv, and updates the seeder counts +echo "Refetching seeder counts ..." +cd .. +torrents_csv="`pwd`/torrents.csv" +prune_currents_tmps="`pwd`/prune_currents_tmps" +mkdir $prune_currents_tmps +cd $prune_currents_tmps + +cp $torrents_csv tmp + +# Extract the header +header=$(head -n1 tmp) +sed -i '1d' tmp + +cat tmp | cut -d ';' -f1 > tmp2 +mv tmp2 tmp + +mkdir prune_currents_tmps +# Split these up into 2000 file batches +split -l 2000 tmp tmp_ + +> no_seeds +for f in tmp_*; do + echo "Fetching seeds..." + echo $f + torrent-tracker-health --torrent "$f" > health + + # Select the infohashes with zero seeders + # append to a no seeds file + jq '.results[] | select(.seeders==0) | .hash' health | tr -d \" >> no_seeds + rm $f +done + +# Remove those lines from the file +rg -vwF -f no_seeds $torrents_csv > torrents_removed.csv + +rm tmp_* +rm health + + + + + + + + + + + + + + diff --git a/scripts/scan_torrents.sh b/scripts/scan_torrents.sh index 2eaef8a..edae0d7 100755 --- a/scripts/scan_torrents.sh +++ b/scripts/scan_torrents.sh @@ -6,11 +6,7 @@ cd .. 
torrents_csv="`pwd`/torrents.csv" scanned_out="`pwd`/infohashes_scanned.txt" tmp_torrent_dir="`pwd`/tmp_torrents" -names_out="`pwd`/names.out" -health_out="`pwd`/health.out" touch $scanned_out -touch $names_out -touch $health_out help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help" if [ "$1" == "-h" ] || [ -z "$1" ]; then @@ -32,47 +28,50 @@ fi # Loop over all torrents pushd $torrents_dir -# for torrent_file in *.torrent; do # Copy the unscanned torrent files to a temp dir mkdir $tmp_torrent_dir find `pwd` -name "*.torrent" | sort -n | grep -vFf $scanned_out | while read torrent_file ; do cp "$torrent_file" "$tmp_torrent_dir" - echo $(basename "$torrent_file" .torrent) >> $names_out done -# Delete null torrents from the temp dir -find $tmp_torrent_dir -name "*.torrent" -size -2k -delete +# Split these into many directories ( since torrent-tracker-health can't do too many full size torrents) +cd $tmp_torrent_dir +# i=1;while read l;do mkdir $i;mv $l $((i++));done< <(ls|xargs -n100) +ls|parallel -n100 mkdir {#}\;mv {} {#} -if [ -z "$(ls -A $tmp_torrent_dir)" ]; then - echo "No new torrents." -else - # Scrape it - torrent-tracker-health --torrent "$tmp_torrent_dir" > $health_out +for tmp_torrent_dir_sub in *; do + echo "sub dir: $tmp_torrent_dir_sub" + find $tmp_torrent_dir_sub -type f -exec basename {} .torrent \; > names.out + + # Delete null torrents from the temp dir + find $tmp_torrent_dir_sub -name "*.torrent" -size -2k -delete - echo -e "$health_out" - - # Convert the json results to csv format - results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' $health_out) -# // "1970-01-01T00:00:00.000Z" - # If there are no results - if [ -z "$results" ]; then - echo "There were no results for some reason." 
+ if [ -z "$(ls -A $tmp_torrent_dir_sub)" ]; then + echo "No new torrents." else - # Update the torrents.csv and infohashes scanned file - echo -e "$results" >> $torrents_csv - cat "$names_out" >> $scanned_out + # Scrape it + torrent-tracker-health --torrent "$tmp_torrent_dir_sub"/ > health.out - popd - cd scripts - . prune.sh + # Convert the json results to csv format + results=$(jq -r '.results | map([.hash, .name, .length, (.created | .[0:16] | strptime("%Y-%m-%dT%H:%M") | mktime), .seeders, .leechers, .completed, (now | floor)] | join(";")) | join("\n")' health.out) + + # If there are no results + if [ -z "$results" ]; then + echo "There were no results for some reason." + else + echo "Torrents.csv updated with new torrents." + # Update the torrents.csv and infohashes scanned file + echo -e "$results" >> $torrents_csv + cat names.out >> $scanned_out + fi fi +done - -fi +popd +cd scripts +. prune.sh # Remove the temp dir rm -rf "$tmp_torrent_dir" -rm "$names_out" -rm "$health_out"