From cf71b8f6df1abdcdbc8403e42001b84091f0f455 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Wed, 6 Feb 2019 10:02:13 -0800 Subject: [PATCH] scan_torrent_files.js now filters out torrents not in torrents.csv. scan_torrents.sh now runs the file scanner. some additional fixes to pruning. --- .gitignore | 1 + new_torrents_fetcher/src/main.rs | 3 ++- scripts/build_sqlite.sh | 3 +-- scripts/prune.sh | 6 ++++-- scripts/prune_currents.sh | 2 +- scripts/scan_torrent_files.js | 26 ++++++++++++++++++++++---- scripts/scan_torrent_files.sh | 10 ---------- scripts/scan_torrents.sh | 15 ++++++++------- 8 files changed, 39 insertions(+), 27 deletions(-) delete mode 100755 scripts/scan_torrent_files.sh diff --git a/.gitignore b/.gitignore index 311c8f6..c6ec303 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ run.out old_greps.sh torrents.db .vscode +backups diff --git a/new_torrents_fetcher/src/main.rs b/new_torrents_fetcher/src/main.rs index f2488b8..8679af3 100644 --- a/new_torrents_fetcher/src/main.rs +++ b/new_torrents_fetcher/src/main.rs @@ -44,8 +44,9 @@ fn main() { // torrentz2(save_dir); magnetdl(save_dir); - skytorrents(save_dir); leetx(save_dir); + skytorrents(save_dir); + if let Some(t) = matches.value_of("TORRENTS_CSV_FILE") { torrents_csv_scan(Path::new(t), save_dir); diff --git a/scripts/build_sqlite.sh b/scripts/build_sqlite.sh index 9772a4e..37f2639 100755 --- a/scripts/build_sqlite.sh +++ b/scripts/build_sqlite.sh @@ -3,7 +3,7 @@ csv_file="${TORRENTS_CSV_FILE:-../torrents.csv}" db_file="${TORRENTS_CSV_DB_FILE:-../torrents.db}" torrent_files_json="`pwd`/../torrent_files.json" -echo "Creating temporary torrents.db file..." +echo "Creating temporary torrents.db file from $csv_file ..." # Remove double quotes for csv import sed 's/\"//g' $csv_file > torrents_removed_quotes.csv @@ -11,7 +11,6 @@ sed 's/\"//g' $csv_file > torrents_removed_quotes.csv # Sort by seeders desc before insert sort --field-separator=';' --key=5 -nr -o torrents_removed_quotes.csv torrents_removed_quotes.csv -rm db_tmp touch db_tmp sqlite3 -batch db_tmp <<"EOF" diff --git a/scripts/prune.sh b/scripts/prune.sh index 8fe0441..a551998 100755 --- a/scripts/prune.sh +++ b/scripts/prune.sh @@ -1,6 +1,6 @@ # This prunes torrents.csv, removing those with too many columns, and sorts it echo "Pruning torrents.csv ..." -cd .. +pushd .. torrents_csv="`pwd`/torrents.csv" torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv" scanned_out="`pwd`/infohashes_scanned.txt" @@ -12,7 +12,7 @@ rg "^([^;]*;){7}[^;]+$" $torrents_csv_tmp > tmp_adds mv tmp_adds $torrents_csv_tmp # Remove random newlines -sed -i '/^$/d' $torrents_csv_tmp +sed -i '/^$/d' $torrents_csv_tmp # Extract the header header=$(head -n1 $torrents_csv_tmp) @@ -41,4 +41,6 @@ sed -i "1i $header" $torrents_csv_tmp mv $torrents_csv_tmp $torrents_csv +popd + echo "Pruning done." diff --git a/scripts/prune_currents.sh b/scripts/prune_currents.sh index a20e4ba..0f9096a 100755 --- a/scripts/prune_currents.sh +++ b/scripts/prune_currents.sh @@ -32,7 +32,7 @@ for f in tmp_*; do done # Remove those lines from the file -rg -vwF -f no_seeds $torrents_csv > $torrents_removed +grep -vwF -f no_seeds $torrents_csv > $torrents_removed cd .. rm $prune_currents_tmps diff --git a/scripts/scan_torrent_files.js b/scripts/scan_torrent_files.js index 9de981e..fdba2f1 100644 --- a/scripts/scan_torrent_files.js +++ b/scripts/scan_torrent_files.js @@ -1,16 +1,18 @@ -// jq -r 'to_entries[] | {hash: .key, val: .value[]} | {hash: .hash, i: .val.i, p: .val.p, l: .val.l}' torrent_files.json -// jq -r 'to_entries[] | {hash: .key, val: .value[]} | [.hash, .val.i, .val.p, .val.l] | join(";")' torrent_files.json var fs = require('fs'), path = require('path'), readTorrent = require('read-torrent'), argv = require('minimist')(process.argv.slice(2)); var torrentFiles = {}; +var torrentCsvHashes = new Set(); + var jsonFile = '../torrent_files.json'; +var torrentsCsvFile = '../torrents.csv'; main(); async function main() { await fillTorrentFiles(); + await fillTorrentCsvHashes(); await scanFolder(); writeFile(); } @@ -23,13 +25,26 @@ async function fillTorrentFiles() { } } +async function fillTorrentCsvHashes() { + var fileContents = await fs.promises.readFile(torrentsCsvFile, 'utf8'); + var lines = fileContents.split('\n'); + for (const line of lines) { + var hash = line.split(';')[0]; + torrentCsvHashes.add(hash); + } + torrentCsvHashes.delete('infohash'); +} + async function scanFolder() { console.log('Scanning dir: ' + argv.dir + '...'); var files = fs.readdirSync(argv.dir).filter(f => { var f = f.split('.'); var ext = f[1]; var hash = f[0]; - return (ext == 'torrent' && !Object.keys(torrentFiles).includes(hash)); + // It must be a torrent file, NOT already be in the files json, + // and be an infohash in the csv file. + return (ext == 'torrent' && + !Object.keys(torrentFiles).includes(hash)); }); for (const file of files) { var fullPath = argv.dir + '/' + file; @@ -41,7 +56,10 @@ async function scanFolder() { } function writeFile() { - torrentFiles = Object.keys(torrentFiles).sort().reduce((r, k) => (r[k] = torrentFiles[k], r), {}); + torrentFiles = Object.keys(torrentFiles) + .sort() + .filter(hash => torrentCsvHashes.has(hash)) + .reduce((r, k) => (r[k] = torrentFiles[k], r), {}); fs.writeFileSync(jsonFile, JSON.stringify(torrentFiles)); console.log(`${jsonFile} written.`); } diff --git a/scripts/scan_torrent_files.sh b/scripts/scan_torrent_files.sh deleted file mode 100755 index 2a04a29..0000000 --- a/scripts/scan_torrent_files.sh +++ /dev/null @@ -1,10 +0,0 @@ -torrent_files_json="`pwd`/../torrent_files.json" - -# Scan the torrent_files.json for already scanned torrents - -# Disjoint the ones there with the ones in your torrent scan dir - -# Run the js read-torrent in that dir, and update the torrent_files.json with the new ones - -node scan_torrent_files.js --dir "$1" - diff --git a/scripts/scan_torrents.sh b/scripts/scan_torrents.sh index edae0d7..a025509 100755 --- a/scripts/scan_torrents.sh +++ b/scripts/scan_torrents.sh @@ -2,10 +2,11 @@ # Checking arguments # Help line -cd .. -torrents_csv="`pwd`/torrents.csv" -scanned_out="`pwd`/infohashes_scanned.txt" -tmp_torrent_dir="`pwd`/tmp_torrents" + +torrents_csv="`pwd`/../torrents.csv" +scanned_out="`pwd`/../infohashes_scanned.txt" +tmp_torrent_dir="`pwd`/../tmp_torrents" +torrent_files_json="`pwd`/../torrent_files.json" touch $scanned_out help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help" @@ -42,7 +43,7 @@ ls|parallel -n100 mkdir {#}\;mv {} {#} for tmp_torrent_dir_sub in *; do echo "sub dir: $tmp_torrent_dir_sub" find $tmp_torrent_dir_sub -type f -exec basename {} .torrent \; > names.out - + # Delete null torrents from the temp dir find $tmp_torrent_dir_sub -name "*.torrent" -size -2k -delete @@ -68,10 +69,10 @@ for tmp_torrent_dir_sub in *; do done popd -cd scripts . prune.sh - # Remove the temp dir rm -rf "$tmp_torrent_dir" +# Scan the torrent dir for new files, and add them +node scan_torrent_files.js --dir "$torrents_dir"