From f24fa96a9e06651b34c6d74f54018b1295be6d45 Mon Sep 17 00:00:00 2001
From: Rob Watson
Date: Sat, 7 Apr 2018 13:29:36 +0200
Subject: [PATCH] Add Scraper module

---
 config/config.exs                |   3 +
 lib/amazon_history/cli.ex        |  44 ++++++++---
 lib/amazon_history/scraper.ex    | 128 +++++++++++++++++++++++++++++++
 mix.exs                          |   2 +
 mix.lock                         |  14 ++++
 test/amazon_history/cli_test.exs |  27 +++++--
 6 files changed, 199 insertions(+), 19 deletions(-)
 create mode 100644 lib/amazon_history/scraper.ex
 create mode 100644 mix.lock

diff --git a/config/config.exs b/config/config.exs
index 4d6b7eb..eaf85ec 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -28,3 +28,6 @@ use Mix.Config
 # here (which is why it is important to import them last).
 #
 #     import_config "#{Mix.env}.exs"
+
+config :logger, :console, level: :debug, device: :standard_error
+config :hound, driver: "chrome_driver"
diff --git a/lib/amazon_history/cli.ex b/lib/amazon_history/cli.ex
index e7e7c2d..c8eb058 100644
--- a/lib/amazon_history/cli.ex
+++ b/lib/amazon_history/cli.ex
@@ -3,23 +3,33 @@ defmodule AmazonHistory.CLI do
   Handle the command-line parsing for the amazon_history tool
   """
 
+  require Logger
+
   def run(argv) do
     argv
     |> parse_args
-    |> process
+    |> scrape
+    |> CSV.encode
+    |> Enum.each(&IO.write/1)
   end
 
-  def process(username: username, password: password) do
-    IO.puts("Will continue with username: #{username}, password: #{password}")
+  def scrape(email: email, password: password, start_year: start_year) do
+    Logger.debug("Will scrape with email: #{email}, password: ****")
+    AmazonHistory.Scraper.fetch(email, password, start_year)
   end
 
-  def process(_) do
-    IO.puts("Usage: --username --password ")
+  def scrape(_) do
+    IO.puts(:stderr, "Usage: --email --password --start-year ")
+
+    # run/1 pipes this result into CSV.encode/1, so return an empty row list
+    # rather than the :ok that IO.puts/2 yields.
+    []
   end
 
   @doc """
   Options:
-    -u/--username: Amazon username
+    -e/--email: Amazon email
     -p/--password: Amazon password
+    -y/--start-year: first year to scrape (defaults to 2000)
   """
   def parse_args(argv) do
@@ -27,21 +37,31 @@ defmodule AmazonHistory.CLI do
       argv,
       switches: [
         help: :boolean,
-        username: :string,
-        password: :string
+        email: :string,
+        password: :string,
+        start_year: :integer,
       ],
       aliases: [
         h: :help,
-        u: :username,
-        p: :password
+        e: :email,
+        p: :password,
+        y: :start_year,
       ]
     )
     |> elem(0)
+    # Options arrive in the order they were typed. Sorting puts the keys in
+    # alphabetical order (email, password, start_year), which is the order the
+    # clauses below match on, so flag order no longer matters.
+    |> Enum.sort()
     |> args_to_internal_representation
   end
 
-  defp args_to_internal_representation(username: username, password: password) do
-    [username: username, password: password]
+  defp args_to_internal_representation(email: email, password: password, start_year: start_year) do
+    [email: email, password: password, start_year: start_year]
+  end
+
+  defp args_to_internal_representation(email: email, password: password) do
+    [email: email, password: password, start_year: 2000]
   end
 
   defp args_to_internal_representation(_) do
diff --git a/lib/amazon_history/scraper.ex b/lib/amazon_history/scraper.ex
new file mode 100644
index 0000000..6bc2b85
--- /dev/null
+++ b/lib/amazon_history/scraper.ex
@@ -0,0 +1,128 @@
+defmodule AmazonHistory.Scraper do
+  @moduledoc """
+  Scrapes the order history of an Amazon account through a Hound browser session.
+  """
+
+  @base_url "https://www.amazon.co.uk"
+  @order_url "#{@base_url}/gp/your-account/order-history?opt=ab&digitalOrders=1&unifiedOrders=1&returnTo=&orderFilter=year-"
+  @orders_per_page 10
+
+  require Logger
+  use Hound.Helpers
+
+  @doc """
+  Logs in with `email` and `password` and returns one row per purchased item,
+  `[order_number, order_placed, name, total, url]`, for each year from
+  `start_year` to the current year.
+  """
+  def fetch(email, password, start_year) do
+    Hound.start_session()
+
+    try do
+      login(email, password)
+
+      Logger.info("Logged in successfully. Scraping with start_year: #{start_year}")
+
+      # The end year is read at run time: a module attribute would freeze it at
+      # the year the code was compiled in.
+      end_year = DateTime.utc_now().year
+      Enum.flat_map(start_year..end_year, &process_year(&1, 0, []))
+    after
+      # Release the browser session even when login or scraping raises.
+      Hound.end_session()
+    end
+  end
+
+  defp login(email, password) do
+    navigate_to(@base_url)
+
+    find_element(:id, "nav-link-yourAccount")
+    |> click
+
+    find_element(:id, "ap_email")
+    |> fill_field(email)
+
+    find_element(:id, "continue")
+    |> click
+
+    find_element(:id, "ap_password")
+    |> fill_field(password)
+
+    find_element(:id, "signInSubmit")
+    |> click
+  end
+
+  # Loads one page of `year`'s orders; paging continues until a page is empty.
+  defp process_year(year, start_index, items) do
+    order_url(year, start_index)
+    |> navigate_to
+
+    find_element(:id, "ordersContainer")
+    |> find_all_within_element(:class, "order")
+    |> process_order_elements(year, start_index, items)
+  end
+
+  defp order_url(year, start_index) do
+    @order_url <> to_string(year) <> "&startIndex=" <> to_string(start_index)
+  end
+
+  defp process_order_elements([], _year, _start_index, items) do
+    items
+  end
+
+  defp process_order_elements(order_elements, year, start_index, items) do
+    new_items = Enum.flat_map(order_elements, &process_order_element/1)
+
+    process_year(year, start_index + @orders_per_page, items ++ new_items)
+  end
+
+  defp process_order_element(order_element) do
+    order_placed = extract_order_placed(order_element)
+    order_number = extract_order_number(order_element)
+    total = extract_total(order_element)
+
+    find_all_within_element(order_element, :class, "a-link-normal")
+    |> Enum.map(&to_row(&1, order_placed, order_number, total))
+    |> Enum.filter(&valid_row?/1)
+  end
+
+  # The leading "." scopes the XPath to `order_element`. A bare "//" is
+  # evaluated against the whole document, so every order would report the
+  # values of the first order on the page.
+  defp extract_order_placed(order_element) do
+    find_within_element(order_element, :xpath, ".//span[@class='a-color-secondary value']")
+    |> inner_text
+  end
+
+  defp extract_order_number(order_element) do
+    find_within_element(order_element, :xpath, "(.//span[@class='a-color-secondary value'])[3]")
+    |> inner_text
+  end
+
+  defp extract_total(order_element) do
+    order_element
+    |> search_within_element(:class, "a-color-price", 1)
+    |> total_or_empty_string
+  end
+
+  defp total_or_empty_string({:ok, element}) do
+    inner_text(element)
+  end
+
+  defp total_or_empty_string({:error, _}) do
+    ""
+  end
+
+  defp to_row(item_element, order_placed, order_number, total) do
+    name = inner_text(item_element)
+    url = attribute_value(item_element, :href)
+
+    [order_number, order_placed, name, total, url]
+  end
+
+  # Keeps only named links that point at a product page; the `is_binary/1`
+  # check covers links for which no href value comes back.
+  defp valid_row?([_order_number, _order_placed, name, _total, url]) do
+    is_binary(url) and String.trim(name) != "" and String.contains?(url, "gp/product")
+  end
+end
diff --git a/mix.exs b/mix.exs
index 9375296..71d23fa 100644
--- a/mix.exs
+++ b/mix.exs
@@ -21,6 +21,8 @@ defmodule AmazonHistory.MixProject do
   # Run "mix help deps" to learn about dependencies.
   defp deps do
     [
+      {:hound, "~> 1.0"},
+      {:csv, "~> 2.0.0"},
       # {:dep_from_hexpm, "~> 0.3.0"},
       # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
     ]
diff --git a/mix.lock b/mix.lock
new file mode 100644
index 0000000..6e045ed
--- /dev/null
+++ b/mix.lock
@@ -0,0 +1,14 @@
+%{
+  "certifi": {:hex, :certifi, "2.3.1", "d0f424232390bf47d82da8478022301c561cf6445b5b5fb6a84d49a9e76d2639", [:rebar3], [{:parse_trans, "3.2.0", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
+  "csv": {:hex, :csv, "2.0.0", "c66fea89ba7862b94901baf0871285e9b73cad89c5fdb57a6386d2adcf29593e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm"},
+  "hackney": {:hex, :hackney, "1.12.1", "8bf2d0e11e722e533903fe126e14d6e7e94d9b7983ced595b75f532e04b7fdc7", [:rebar3], [{:certifi, "2.3.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "5.1.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
+  "hound": {:hex, :hound, "1.0.4", "31db3c013f0ed321b5eb4c573bf3fbc0b74e12fc8da134f9f616527bf0906431", [:mix], [{:hackney, "~> 1.5", [hex: :hackney, repo: "hexpm", optional: false]}, {:poison, ">= 1.4.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"},
+  "idna": {:hex, :idna, "5.1.1", "cbc3b2fa1645113267cc59c760bafa64b2ea0334635ef06dbac8801e42f7279c", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
+  "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
+  "parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm"},
+  "parse_trans": {:hex, :parse_trans, "3.2.0", "2adfa4daf80c14dc36f522cf190eb5c4ee3e28008fc6394397c16f62a26258c2", [:rebar3], [], "hexpm"},
+  "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], [], "hexpm"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], [], "hexpm"},
+}
diff --git a/test/amazon_history/cli_test.exs b/test/amazon_history/cli_test.exs
index a5737ec..159484d 100644
--- a/test/amazon_history/cli_test.exs
+++ b/test/amazon_history/cli_test.exs
@@ -9,9 +9,9 @@ defmodule CliTest do
     assert parse_args(["--help"]) == :help
   end
 
-  test ":help returned by passing only a username" do
-    assert parse_args(["-u", "rob"]) == :help
-    assert parse_args(["--username", "rob"]) == :help
+  test ":help returned by passing only a email" do
+    assert parse_args(["-e", "rob@netflux.io"]) == :help
+    assert parse_args(["--email", "rob@netflux.io"]) == :help
   end
 
   test ":help returned by passing only a password" do
@@ -20,10 +20,23 @@ defmodule CliTest do
   end
 
+  test "arguments returned regardless of option order" do
+    assert parse_args(["-y", "2015", "-p", "hackme", "-e", "rob@netflux.io"]) == [
+             email: "rob@netflux.io",
+             password: "hackme",
+             start_year: 2015
+           ]
+  end
+
   test "arguments returned by passing valid parameters" do
-    assert parse_args(["-u", "rob", "-p", "hackme"]) == [username: "rob", password: "hackme"]
+    assert parse_args(["-e", "rob@netflux.io", "-p", "hackme"]) == [
+             email: "rob@netflux.io",
+             password: "hackme",
+             start_year: 2000
+           ]
 
-    assert parse_args(["--username", "rob", "--password", "hackme"]) == [
-             username: "rob",
-             password: "hackme"
+    assert parse_args(["--email", "rob@netflux.io", "--password", "hackme"]) == [
+             email: "rob@netflux.io",
+             password: "hackme",
+             start_year: 2000
            ]
   end