Changing to csv for torrent files as well. Fixes #60.

2019-06-25 14:28:44 -07:00 · 2019-06-25 14:28:44 -07:00 · 61523cf846
commit 61523cf846
parent 729b876102
5 changed files with 81 additions and 85 deletions
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@

 [Demo Server](https://torrents-csv.ml)

-`Torrents.csv` is a *collaborative* repository of torrents and their files, consisting of a searchable `torrents.csv`, and `torrent_files.json`. With it you can search for torrents, or files within torrents. It aims to be a universal file system for popular data.
+`Torrents.csv` is a *collaborative* repository of torrents and their files, consisting of a searchable `torrents.csv`, and `torrent_files.csv`. With it you can search for torrents, or files within torrents. It aims to be a universal file system for popular data.

 Its initially populated with a January 2017 backup of the pirate bay, and new torrents are periodically added from various torrents sites. It comes with a self-hostable [Torrents.csv webserver](https://torrents-csv.ml), a command line search, and a folder scanner to add torrents, and their files.

@ -62,7 +62,7 @@ bleh season 1 (1993-)
 	link: magnet:?xt=urn:btih:INFO_HASH_HERE
 ```
 ## Uploading / Adding Torrents from a Directory
-An *upload*, consists of making a pull request after running the `scan_torrents.sh` script, which adds torrents from a directory you choose to the `.csv` file, after checking that they aren't already there, and that they have seeders. It also adds their files to `torrent_files.json`.
+An *upload* consists of making a pull request after running the `scan_torrents.sh` script, which adds torrents from a directory you choose to the `.csv` file, after checking that they aren't already there, and that they have seeders. It also adds their files to `torrent_files.csv`.

 ### Requirements
 - [Torrent-Tracker-Health Dessalines branch](https://github.com/dessalines/torrent-tracker-health)
@ -108,16 +108,7 @@ infohash;name;size_bytes;created_unix;seeders;leechers;completed;scraped_date
 # torrents here...
 ```

-## How the torrent_files.json looks
-```
-{
-  "012ae083ec82bf911f4fe503b9f6df1effaad9ac": [
-    {
-      "i": 0, // the index
-      "p": "File 1", // the path
-      "l": 88546036A // the size in bytes
-    },
-    ...
-  ]
-}
+## How the torrent_files.csv looks
+```sh
+infohash;index;path;size_bytes
 ```
--- a/scripts/build_sqlite.sh
+++ b/scripts/build_sqlite.sh
@ -1,9 +1,9 @@
 #!/bin/bash
 csv_file="../torrents.csv"
-torrent_files_json="../torrent_files.json"
+torrent_files_csv="../torrent_files.csv"
 db_file="${TORRENTS_CSV_DB_FILE:-../torrents.db}"

-echo "Creating temporary torrents.db file from $csv_file ..."
+echo "Creating temporary torrents.db file..."

 # Remove double quotes for csv import
 sed 's/\"//g' $csv_file > torrents_removed_quotes.csv
@ -31,18 +31,15 @@ UPDATE torrents SET completed=NULL WHERE completed = '';
 EOF
 rm torrents_removed_quotes.csv

-# Cache torrent files if they exist
-if [ -f $torrent_files_json ]; then
-  echo "Building files DB from $torrent_files_json ..."
+# Cache torrent files
+echo "Building files DB from $torrent_files_csv ..."

-  # Old way, doesn't work with too much ram
-  # jq -r 'to_entries[] | {hash: .key, val: .value[]} | [.hash, .val.i, .val.p, .val.l] | join(";")' $torrent_files_json > torrent_files_temp
-
-  # New way, credit to ogusismail : https://stackoverflow.com/a/55600294/1655478
-  jq --stream -n -r 'foreach inputs as $pv ([[],[]]; if ($pv|length) == 2 then (.[0] |= if . == [] then . + [$pv[0][0],$pv[1]] else . + [$pv[1]] end) else [[],.[0]] end; if .[0] == [] and .[1] != [] then .[1] else empty end) | join(";")' $torrent_files_json > torrent_files_temp
+# Remove double quotes for csv import
+sed 's/\"//g' $torrent_files_csv > torrent_files_removed_quotes.csv

 # Removing those with too many ;
-rg "^([^;]*;){3}[^;]+$" torrent_files_temp > torrent_files_temp_2
+rg "^([^;]*;){3}[^;]+$" torrent_files_removed_quotes.csv > torrent_files_temp_2
+rm torrent_files_removed_quotes.csv
 mv torrent_files_temp_2 torrent_files_temp

 sqlite3 -batch db_tmp<<EOF
@ -84,9 +81,7 @@ delete from files where seeders is null;
 drop table files_tmp;
 EOF
 rm torrent_files_temp
-  fi

 mv db_tmp $db_file

 echo "Done."
-
--- a/scripts/prune.sh
+++ b/scripts/prune.sh
@ -5,6 +5,9 @@ torrents_csv="`pwd`/torrents.csv"
 torrents_csv_tmp="`pwd`/torrents_prune_tmp.csv"
 scanned_out="`pwd`/infohashes_scanned.txt"

+torrent_files_csv="`pwd`/torrent_files.csv"
+torrent_files_csv_tmp="`pwd`/torrent_files_tmp.csv"
+
 cp $torrents_csv $torrents_csv_tmp

 # Remove lines that don't have exactly 7 ';'
@ -39,6 +42,22 @@ sed  -i "1i $header" $torrents_csv_tmp

 mv $torrents_csv_tmp $torrents_csv

+# Torrent files cleanup
+echo "Pruning torrent_files.csv ..."
+cp $torrent_files_csv $torrent_files_csv_tmp
+
+# Header
+header=$(head -n1 $torrent_files_csv_tmp)
+sed -i '1d' $torrent_files_csv_tmp
+
+# Sort the rows on the infohash and index fields (fields 1-2)
+sort --field-separator=';' --key=1,2 -o $torrent_files_csv_tmp $torrent_files_csv_tmp
+
+# Add the header back in
+sed  -i "1i $header" $torrent_files_csv_tmp
+
+mv $torrent_files_csv_tmp $torrent_files_csv
+
 popd

 echo "Pruning done."
--- a/scripts/scan_torrent_files.js
+++ b/scripts/scan_torrent_files.js
@ -1,44 +1,38 @@
 const fs = require('fs'),
-  path = require('path'),
  readTorrent = require('read-torrent'),
-  argv = require('minimist')(process.argv.slice(2));
+  argv = require('minimist')(process.argv.slice(2)),
+  readline = require('readline');

-var torrentFiles = {};
-var torrentCsvHashes = new Set();
+var scannedCsvHashes = new Set();

-var jsonFile = '../torrent_files.json';
-var torrentsCsvFile = '../torrents.csv';
-console.log(`Scanning torrent files from ${argv.dir} into ${jsonFile} ...`);
+var torrentFilesCsv = '../torrent_files.csv';
+console.log(`Scanning torrent files from ${argv.dir} into ${torrentFilesCsv} ...`);
 main();

 async function main() {
-  await fillTorrentFiles();
-  await fillTorrentCsvHashes();
-  await scanFolder();
-  writeFile();
+  await fillScannedHashes();
+  scanFolder();
 }

+async function fillScannedHashes() {
+  console.log(`Filling CSV hashes...`);
+  const fileStream = fs.createReadStream(torrentFilesCsv);

-async function fillTorrentFiles() {
-  if (fs.existsSync(jsonFile)) {
-    var fileContents = await fs.promises.readFile(jsonFile, 'utf8');
-    torrentFiles = JSON.parse(fileContents);
-  }
-}
+  const rl = readline.createInterface({
+    input: fileStream,
+    crlfDelay: Infinity
+  });

-async function fillTorrentCsvHashes() {
-  var fileContents = await fs.promises.readFile(torrentsCsvFile, 'utf8');
-  var lines = fileContents.split('\n');
-  for (const line of lines) {
+  for await (const line of rl) {
    var hash = line.split(';')[0];
-    torrentCsvHashes.add(hash);
+    scannedCsvHashes.add(hash);
  }
-  torrentCsvHashes.delete('infohash');
+
+  scannedCsvHashes.delete('infohash');
 }

 async function scanFolder() {
  console.log('Scanning dir: ' + argv.dir + '...');
-  var fileHashes = new Set(Object.keys(torrentFiles));

  var files = fs.readdirSync(argv.dir).filter(f => {
    var sp = f.split('.');
@ -46,21 +40,35 @@ async function scanFolder() {
    var hash = sp[0];
    var fullPath = argv.dir + '/' + f;
    // It must be a torrent file,
-    // NOT in the torrent_files.json
-    // must be in the CSV file
+    // must not be in the CSV file
    // must have a file size
+    // hash must be 40 characters long (the infohash length)
    return (ext == 'torrent' &&
-      !fileHashes.has(hash) &&
-      torrentCsvHashes.has(hash) &&
-      getFilesizeInBytes(fullPath) > 0);
+      !scannedCsvHashes.has(hash) &&
+      getFilesizeInBytes(fullPath) > 0) &&
+      hash.length == 40;
  });
-  for (const file of files) {
+
+  for (file of files) {
    var fullPath = argv.dir + '/' + file;
    console.log(`Scanning File ${fullPath}`);
-    var torrent = await read(fullPath).catch(e => console.log('Read error'));
-    torrentFiles = { ...torrentFiles, ...torrent }; // concat them
-  };
-  console.log('Done scanning.');
+    var torrent = await read(fullPath).catch(e => console.log(e));
+    await writeFile(torrent);
+  }
+  console.log('Done.');
+}
+
+function writeFile(torrent) {
+  for (const infohash in torrent) {
+    let files = torrent[infohash];
+    for (const file of files) {
+      let csvRow = `${infohash};${file.i};${file.p};${file.l}\n`;
+      fs.appendFile(torrentFilesCsv, csvRow, function (err) {
+        if (err) throw err;
+      });
+
+    }
+  }
 }

 function getFilesizeInBytes(filename) {
@ -69,23 +77,6 @@ function getFilesizeInBytes(filename) {
  return fileSizeInBytes;
 }

-function writeFile() {
-  torrentFiles = Object.keys(torrentFiles)
-    .sort()
-    .filter(hash => torrentCsvHashes.has(hash))
-    .reduce((r, k) => (r[k] = torrentFiles[k], r), {});
-  fs.writeFileSync(jsonFile, "{\n");
-  var first = true;
-  for (let [key, value] of Object.entries(torrentFiles)) {
-    if(first) first = false;
-    else fs.appendFileSync(jsonFile, ",\n");
-    fs.appendFileSync(jsonFile, `${JSON.stringify(key)}:${JSON.stringify(value)}`);
-  }
-  fs.appendFileSync(jsonFile, "\n}");
-  console.log(`${jsonFile} written.`);
-  process.exit();
-}
-
 function read(uri, options) {
  return new Promise((resolve, reject) => {
    readTorrent(uri, (err, info) => {
--- a/scripts/scan_torrents.sh
+++ b/scripts/scan_torrents.sh
@ -6,7 +6,7 @@
 torrents_csv="`pwd`/../torrents.csv"
 scanned_out="`pwd`/../infohashes_scanned.txt"
 tmp_torrent_dir="`pwd`/../tmp_torrents-$RANDOM"
-torrent_files_json="`pwd`/../torrent_files.json"
+torrent_files_csv="`pwd`/../torrent_files.csv"
 touch $scanned_out

 help="Run ./scan_torrents.sh [TORRENTS_DIR] \nor goto https://gitlab.com/dessalines/torrents.csv for more help"
@ -76,5 +76,5 @@ popd
 rm -rf "$tmp_torrent_dir"

 # Scan the torrent dir for new files, and add them
-node --max-old-space-size=4096 scan_torrent_files.js --dir "$torrents_dir"
+node --max-old-space-size=8096 scan_torrent_files.js --dir "$torrents_dir"