Changing to csv for torrent files as well. Fixes #60.

2019-06-25 14:28:44 -07:00 · 2019-06-25 14:28:44 -07:00 · 61523cf846
commit 61523cf846
parent 729b876102
5 changed files with 81 additions and 85 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [Demo Server](https://torrents-csv.ml)
-`Torrents.csv` is a *collaborative* repository of torrents and their files, consisting of a searchable `torrents.csv`, and `torrent_files.json`. With it you can search for torrents, or files within torrents. It aims to be a universal file system for popular data.
+`Torrents.csv` is a *collaborative* repository of torrents and their files, consisting of a searchable `torrents.csv`, and `torrent_files.csv`. With it you can search for torrents, or files within torrents. It aims to be a universal file system for popular data.
 It's initially populated with a January 2017 backup of the pirate bay, and new torrents are periodically added from various torrent sites. It comes with a self-hostable [Torrents.csv webserver](https://torrents-csv.ml), a command line search, and a folder scanner to add torrents, and their files.
@ -62,7 +62,7 @@ bleh season 1 (1993-)
 	link: magnet:?xt=urn:btih:INFO_HASH_HERE
 ```
 ## Uploading / Adding Torrents from a Directory
-An *upload*, consists of making a pull request after running the `scan_torrents.sh` script, which adds torrents from a directory you choose to the `.csv` file, after checking that they aren't already there, and that they have seeders. It also adds their files to `torrent_files.json`.
+An *upload* consists of making a pull request after running the `scan_torrents.sh` script, which adds torrents from a directory you choose to the `.csv` file, after checking that they aren't already there, and that they have seeders. It also adds their files to `torrent_files.csv`.
 ### Requirements
 - [Torrent-Tracker-Health Dessalines branch](https://github.com/dessalines/torrent-tracker-health)
@ -108,16 +108,7 @@ infohash;name;size_bytes;created_unix;seeders;leechers;completed;scraped_date
 # torrents here...
 ```
-## How the torrent_files.json looks
+## How the torrent_files.csv looks
-```
+```sh
-{
+infohash;index;path;size_bytes
  "012ae083ec82bf911f4fe503b9f6df1effaad9ac": [
    {
      "i": 0, // the index
      "p": "File 1", // the path
      "l": 88546036A // the size in bytes
    },
    ...
  ]
 }
 ```
--- a/scripts/build_sqlite.sh
+++ b/scripts/build_sqlite.sh
@ -1,9 +1,9 @@
 #!/bin/bash
 csv_file="../torrents.csv"
-torrent_files_json="../torrent_files.json"
+torrent_files_csv="../torrent_files.csv"
 db_file="${TORRENTS_CSV_DB_FILE:-../torrents.db}"
-echo "Creating temporary torrents.db file from $csv_file ..."
+echo "Creating temporary torrents.db file..."
 # Remove double quotes for csv import
 sed 's/\"//g' $csv_file > torrents_removed_quotes.csv
@ -31,18 +31,15 @@ UPDATE torrents SET completed=NULL WHERE completed = '';
 EOF
 rm torrents_removed_quotes.csv
-# Cache torrent files if they exist
+# Cache torrent files
-if [ -f $torrent_files_json ]; then
+echo "Building files DB from $torrent_files_csv ..."
  echo "Building files DB from $torrent_files_json ..."
-  # Old way, doesn't work with too much ram
+# Remove double quotes for csv import
-  # jq -r 'to_entries[] | {hash: .key, val: .value[]} | [.hash, .val.i, .val.p, .val.l] | join(";")' $torrent_files_json > torrent_files_temp
+sed 's/\"//g' $torrent_files_csv > torrent_files_removed_quotes.csv
  # New way, credit to ogusismail : https://stackoverflow.com/a/55600294/1655478
  jq --stream -n -r 'foreach inputs as $pv ([[],[]]; if ($pv|length) == 2 then (.[0] |= if . == [] then . + [$pv[0][0],$pv[1]] else . + [$pv[1]] end) else [[],.[0]] end; if .[0] == [] and .[1] != [] then .[1] else empty end) | join(";")' $torrent_files_json > torrent_files_temp
 # Removing those with too many ;
-rg "^([^;]*;){3}[^;]+$" torrent_files_temp > torrent_files_temp_2
+rg "^([^;]*;){3}[^;]+$" torrent_files_removed_quotes.csv > torrent_files_temp_2
 rm torrent_files_removed_quotes.csv
 mv torrent_files_temp_2 torrent_files_temp
 sqlite3 -batch db_tmp<<EOF
@ -84,9 +81,7 @@ delete from files where seeders is null;
 drop table files_tmp;
 EOF
 rm torrent_files_temp
  fi
-  mv db_tmp $db_file
+mv db_tmp $db_file
  echo "Done."
 echo "Done."
--- a/scripts/prune.sh
+++ b/scripts/prune.sh
@ -5,6 +5,9 @@ torrents_csv="`pwd`/torrents.csv"
 torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv"
 scanned_out="`pwd`/infohashes_scanned.txt"
 torrent_files_csv="`pwd`/torrent_files.csv"
 torrent_files_csv_tmp="`pwd`/torrent_files_tmp.csv"
 cp $torrents_csv $torrents_csv_tmp
 # Remove lines that don't have exactly 7 ';'
@ -39,6 +42,22 @@ sed  -i "1i $header" $torrents_csv_tmp
 mv $torrents_csv_tmp $torrents_csv
 # Torrent files cleanup
 echo "Pruning torrent_files.csv ..."
 cp $torrent_files_csv $torrent_files_csv_tmp
 # Header
 header=$(head -n1 $torrent_files_csv_tmp)
 sed -i '1d' $torrent_files_csv_tmp
 # Same for the infohashes scanned
 sort --field-separator=';' --key=1,2 -o $torrent_files_csv_tmp $torrent_files_csv_tmp
 # Add the header back in
 sed  -i "1i $header" $torrent_files_csv_tmp
 mv $torrent_files_csv_tmp $torrent_files_csv
 popd
 echo "Pruning done."
--- a/scripts/scan_torrent_files.js
+++ b/scripts/scan_torrent_files.js
@ -1,44 +1,38 @@
 const fs = require('fs'),
  path = require('path'),
  readTorrent = require('read-torrent'),
-  argv = require('minimist')(process.argv.slice(2));
+  argv = require('minimist')(process.argv.slice(2)),
  readline = require('readline');
-var torrentFiles = {};
+var scannedCsvHashes = new Set();
 var torrentCsvHashes = new Set();
-var jsonFile = '../torrent_files.json';
+var torrentFilesCsv = '../torrent_files.csv';
-var torrentsCsvFile = '../torrents.csv';
+console.log(`Scanning torrent files from ${argv.dir} into ${torrentFilesCsv} ...`);
 console.log(`Scanning torrent files from ${argv.dir} into ${jsonFile} ...`);
 main();
 async function main() {
-  await fillTorrentFiles();
+  await fillScannedHashes();
-  await fillTorrentCsvHashes();
+  scanFolder();
  await scanFolder();
  writeFile();
 }
 async function fillScannedHashes() {
  console.log(`Filling CSV hashes...`);
  const fileStream = fs.createReadStream(torrentFilesCsv);
-async function fillTorrentFiles() {
+  const rl = readline.createInterface({
-  if (fs.existsSync(jsonFile)) {
+    input: fileStream,
-    var fileContents = await fs.promises.readFile(jsonFile, 'utf8');
+    crlfDelay: Infinity
-    torrentFiles = JSON.parse(fileContents);
+  });
  }
 }
-async function fillTorrentCsvHashes() {
+  for await (const line of rl) {
  var fileContents = await fs.promises.readFile(torrentsCsvFile, 'utf8');
  var lines = fileContents.split('\n');
  for (const line of lines) {
    var hash = line.split(';')[0];
-    torrentCsvHashes.add(hash);
+    scannedCsvHashes.add(hash);
  }
-  torrentCsvHashes.delete('infohash');
+
  scannedCsvHashes.delete('infohash');
 }
 async function scanFolder() {
  console.log('Scanning dir: ' + argv.dir + '...');
  var fileHashes = new Set(Object.keys(torrentFiles));
  var files = fs.readdirSync(argv.dir).filter(f => {
    var sp = f.split('.');
@ -46,44 +40,41 @@ async function scanFolder() {
    var hash = sp[0];
    var fullPath = argv.dir + '/' + f;
    // It must be a torrent file,
-    // NOT in the torrent_files.json
+    // must not be in the CSV file
    // must be in the CSV file
    // must have a file size
    // must be in infohash format length
    return (ext == 'torrent' &&
-      !fileHashes.has(hash) &&
+      !scannedCsvHashes.has(hash) &&
-      torrentCsvHashes.has(hash) &&
+      getFilesizeInBytes(fullPath) > 0) &&
-      getFilesizeInBytes(fullPath) > 0);
+      hash.length == 40;
  });
-  for (const file of files) {
+
  for (file of files) {
    var fullPath = argv.dir + '/' + file;
    console.log(`Scanning File ${fullPath}`);
-    var torrent = await read(fullPath).catch(e => console.log('Read error'));
+    var torrent = await read(fullPath).catch(e => console.log(e));
-    torrentFiles = { ...torrentFiles, ...torrent }; // concat them
+    await writeFile(torrent);
-  };
+  }
-  console.log('Done scanning.');
+  console.log('Done.');
 }
 /**
  * Appends one CSV row per contained file of each torrent in `torrent`
  * to the file named by `torrentFilesCsv`.
  *
  * Row format: `infohash;index;path;size_bytes`, built from the `i`, `p`
  * and `l` properties of each file entry.
  *
  * All rows are collected first and written with a single synchronous
  * append. Issuing one callback-style `fs.appendFile` per row returns
  * before anything is written (so awaiting this function waits for
  * nothing), gives no ordering guarantee between the concurrent appends,
  * and surfaces write errors as uncaught exceptions from the callback
  * instead of to the caller.
  *
  * @param {Object<string, Array<{i: number, p: string, l: number}>>} [torrent]
  *   Map of infohash to its file list. `undefined`/`null` (e.g. after a
  *   failed read) or an empty object writes nothing.
  * @returns {void}
  * @throws {Error} Whatever `fs.appendFileSync` throws if the write fails.
  */
 function writeFile(torrent) {
  const rows = [];
  // `for...in` over undefined/null iterates zero times, so a failed read
  // upstream is a harmless no-op here.
  for (const infohash in torrent) {
    for (const file of torrent[infohash]) {
      rows.push(`${infohash};${file.i};${file.p};${file.l}\n`);
    }
  }
  // Nothing to write: do not touch (or create) the CSV file.
  if (rows.length === 0) return;
  fs.appendFileSync(torrentFilesCsv, rows.join(''));
 }
 function getFilesizeInBytes(filename) {
-    var stats = fs.statSync(filename);
+  var stats = fs.statSync(filename);
-    var fileSizeInBytes = stats["size"];
+  var fileSizeInBytes = stats["size"];
-    return fileSizeInBytes;
+  return fileSizeInBytes;
 }
 /**
  * Writes the module-level `torrentFiles` map to `jsonFile` as a JSON
  * object with one `"infohash":[...]` entry per line, then terminates
  * the process.
  *
  * NOTE(review): this shares its name with the `writeFile(torrent)` CSV
  * writer elsewhere in this listing; it appears to be the JSON-based
  * implementation from before the CSV change — confirm which one is live.
  */
 function writeFile() {
  // Rebuild `torrentFiles` with its keys sorted and restricted to the
  // infohashes present in `torrentCsvHashes`.
  torrentFiles = Object.keys(torrentFiles)
    .sort()
    .filter(hash => torrentCsvHashes.has(hash))
    .reduce((r, k) => (r[k] = torrentFiles[k], r), {});
  // Start the file (overwriting any previous content) with the opening brace.
  fs.writeFileSync(jsonFile, "{\n");
  // Entries are appended one at a time; `first` suppresses the separator
  // before the first entry so the output has no leading comma.
  var first = true;
  for (let [key, value] of Object.entries(torrentFiles)) {
    if(first) first = false;
    else fs.appendFileSync(jsonFile, ",\n");
    fs.appendFileSync(jsonFile, `${JSON.stringify(key)}:${JSON.stringify(value)}`);
  }
  // Close the JSON object.
  fs.appendFileSync(jsonFile, "\n}");
  console.log(`${jsonFile} written.`);
  // Exit explicitly once the file has been written.
  process.exit();
 }
 function read(uri, options) {
--- a/scripts/scan_torrents.sh
+++ b/scripts/scan_torrents.sh
@ -6,7 +6,7 @@
 torrents_csv="`pwd`/../torrents.csv"
 scanned_out="`pwd`/../infohashes_scanned.txt"
 tmp_torrent_dir="`pwd`/../tmp_torrents-$RANDOM"
-torrent_files_json="`pwd`/../torrent_files.json"
+torrent_files_csv="`pwd`/../torrent_files.csv"
 touch $scanned_out
 help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help"
@ -76,5 +76,5 @@ popd
 rm -rf "$tmp_torrent_dir"
 # Scan the torrent dir for new files, and add them
-node --max-old-space-size=4096 scan_torrent_files.js --dir "$torrents_dir"
+node --max-old-space-size=8096 scan_torrent_files.js --dir "$torrents_dir"