Add Scraper module

This commit is contained in:
Rob Watson 2018-04-07 13:29:36 +02:00
parent e0330fdec0
commit f24fa96a9e
6 changed files with 162 additions and 18 deletions

View File

@ -28,3 +28,6 @@ use Mix.Config
# here (which is why it is important to import them last). # here (which is why it is important to import them last).
# #
# import_config "#{Mix.env}.exs" # import_config "#{Mix.env}.exs"
# Console logger: emit everything from :debug upward and write it to stderr.
# NOTE(review): presumably this keeps stdout free for the tool's CSV output — confirm.
config :logger, :console, level: :debug, device: :standard_error
# WebDriver that Hound should talk to when it opens a browser session.
config :hound, driver: "chrome_driver"

View File

@ -3,23 +3,28 @@ defmodule AmazonHistory.CLI do
Handle the command-line parsing for the amazon_history tool Handle the command-line parsing for the amazon_history tool
""" """
require Logger
def run(argv) do def run(argv) do
argv argv
|> parse_args |> parse_args
|> process |> scrape
|> CSV.encode
|> Enum.each(&IO.write/1)
end end
def process(username: username, password: password) do def scrape(email: email, password: password, start_year: start_year) do
IO.puts("Will continue with username: #{username}, password: #{password}") Logger.debug("Will scrape with email: #{email}, password: ****")
AmazonHistory.Scraper.fetch(email, password, start_year)
end end
def process(_) do def scrape(_) do
IO.puts("Usage: --username <username> --password <password>") IO.puts(:stderr, "Usage: --email <email> --password <password> --start-year <start year>")
end end
@doc """ @doc """
Options: Options:
-u/--username: Amazon username -e/--email: Amazon email
-p/--password: Amazon password -p/--password: Amazon password
""" """
def parse_args(argv) do def parse_args(argv) do
@ -27,21 +32,27 @@ defmodule AmazonHistory.CLI do
argv, argv,
switches: [ switches: [
help: :boolean, help: :boolean,
username: :string, email: :string,
password: :string password: :string,
start_year: :integer,
], ],
aliases: [ aliases: [
h: :help, h: :help,
u: :username, e: :email,
p: :password p: :password,
y: :start_year,
] ]
) )
|> elem(0) |> elem(0)
|> args_to_internal_representation |> args_to_internal_representation
end end
defp args_to_internal_representation(username: username, password: password) do defp args_to_internal_representation(email: email, password: password, start_year: start_year) do
[username: username, password: password] [email: email, password: password, start_year: start_year]
end
# Only email and password were supplied (in that order): fill in the
# default start year of 2000.
defp args_to_internal_representation(email: email, password: password),
  do: [email: email, password: password, start_year: 2000]
defp args_to_internal_representation(_) do defp args_to_internal_representation(_) do

View File

@ -0,0 +1,114 @@
defmodule AmazonHistory.Scraper do
  @moduledoc """
  Scrapes the order history of an amazon.co.uk account by driving a browser
  session through Hound.

  Every purchased item found on the order-history pages becomes one row of the
  form `[order_number, order_placed, name, total, url]`.
  """

  require Logger
  use Hound.Helpers

  @base_url "https://www.amazon.co.uk"
  @order_url "#{@base_url}/gp/your-account/order-history?opt=ab&digitalOrders=1&unifiedOrders=1&returnTo=&orderFilter=year-"
  @orders_per_page 10

  # The leading "." makes the expression relative to the order element it is
  # evaluated against. A bare "//span" is absolute in XPath and would match
  # spans anywhere in the document, so every order would report the values of
  # the first order on the page.
  @order_value_xpath ".//span[@class='a-color-secondary value']"

  @doc """
  Logs in with `email` and `password` and returns the item rows for every year
  from `start_year` up to and including the current (UTC) year.

  Returns an empty list when `start_year` lies in the future. The Hound session
  is always closed, even if logging in or scraping raises.
  """
  @spec fetch(String.t(), String.t(), integer()) :: [[String.t()]]
  def fetch(email, password, start_year) do
    Hound.start_session()

    try do
      login(email, password)
      Logger.info("Logged in successfully. Scraping with start_year: #{start_year}")

      start_year
      |> years_through(current_year())
      |> Enum.flat_map(&process_year(&1, 0))
    after
      Hound.end_session()
    end
  end

  # Evaluated on every call: a module attribute would freeze the year at
  # compile time and silently miss orders once the calendar year rolls over.
  defp current_year, do: DateTime.utc_now().year

  # Ascending list of years; empty (rather than a descending range) when
  # `first` is later than `last`.
  defp years_through(first, last) when first > last, do: []
  defp years_through(first, last), do: Enum.to_list(first..last)

  # Walks through the sign-in form: account link, email, continue, password, submit.
  defp login(email, password) do
    navigate_to(@base_url)

    find_element(:id, "nav-link-yourAccount") |> click()
    find_element(:id, "ap_email") |> fill_field(email)
    find_element(:id, "continue") |> click()
    find_element(:id, "ap_password") |> fill_field(password)
    find_element(:id, "signInSubmit") |> click()
  end

  # Collects the rows of `year`, starting at the page whose first order has
  # offset `start_index` and following the pagination until a page without
  # orders is reached.
  defp process_year(year, start_index) do
    year
    |> order_url(start_index)
    |> navigate_to()

    orders_container = find_element(:id, "ordersContainer")

    case find_all_within_element(orders_container, :class, "order") do
      [] ->
        []

      order_elements ->
        # The rows must be extracted before navigating to the next page,
        # because the element references belong to the current page.
        page_rows = Enum.flat_map(order_elements, &process_order_element/1)
        page_rows ++ process_year(year, start_index + @orders_per_page)
    end
  end

  # Builds the order-history URL for one page of one year.
  defp order_url(year, start_index) do
    @order_url <> to_string(year) <> "&startIndex=" <> to_string(start_index)
  end

  # Turns one order element into a list of rows, one per product link.
  defp process_order_element(order_element) do
    order_placed = extract_order_placed(order_element)
    order_number = extract_order_number(order_element)
    total = extract_total(order_element)

    order_element
    |> find_all_within_element(:class, "a-link-normal")
    |> Enum.map(&to_row(&1, order_placed, order_number, total))
    |> Enum.filter(&valid_row?/1)
  end

  # Text of the first "value" span inside the order element.
  defp extract_order_placed(order_element) do
    order_element
    |> find_within_element(:xpath, @order_value_xpath)
    |> inner_text()
  end

  # Text of the third "value" span inside the order element.
  defp extract_order_number(order_element) do
    order_element
    |> find_within_element(:xpath, "(#{@order_value_xpath})[3]")
    |> inner_text()
  end

  # Text of the price element, or "" when the order shows none.
  defp extract_total(order_element) do
    case search_within_element(order_element, :class, "a-color-price", 1) do
      {:ok, price_element} -> inner_text(price_element)
      {:error, _reason} -> ""
    end
  end

  defp to_row(item_element, order_placed, order_number, total) do
    name = inner_text(item_element)
    url = attribute_value(item_element, :href)
    [order_number, order_placed, name, total, url]
  end

  # A row is kept only when the link has visible text and points at a product
  # page. Links without text or without an href are rejected instead of raising.
  defp valid_row?([_order_number, _order_placed, name, _total, url])
       when is_binary(name) and is_binary(url) do
    String.trim(name) != "" and String.contains?(url, "gp/product")
  end

  defp valid_row?(_row), do: false
end

View File

@ -21,6 +21,8 @@ defmodule AmazonHistory.MixProject do
# Run "mix help deps" to learn about dependencies. # Run "mix help deps" to learn about dependencies.
defp deps do defp deps do
[ [
{:hound, "~> 1.0"},
{:csv, "~> 2.0.0"},
# {:dep_from_hexpm, "~> 0.3.0"}, # {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}, # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
] ]

14
mix.lock Normal file
View File

@ -0,0 +1,14 @@
%{
"certifi": {:hex, :certifi, "2.3.1", "d0f424232390bf47d82da8478022301c561cf6445b5b5fb6a84d49a9e76d2639", [:rebar3], [{:parse_trans, "3.2.0", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
"csv": {:hex, :csv, "2.0.0", "c66fea89ba7862b94901baf0871285e9b73cad89c5fdb57a6386d2adcf29593e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm"},
"hackney": {:hex, :hackney, "1.12.1", "8bf2d0e11e722e533903fe126e14d6e7e94d9b7983ced595b75f532e04b7fdc7", [:rebar3], [{:certifi, "2.3.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "5.1.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"hound": {:hex, :hound, "1.0.4", "31db3c013f0ed321b5eb4c573bf3fbc0b74e12fc8da134f9f616527bf0906431", [:mix], [{:hackney, "~> 1.5", [hex: :hackney, repo: "hexpm", optional: false]}, {:poison, ">= 1.4.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"},
"idna": {:hex, :idna, "5.1.1", "cbc3b2fa1645113267cc59c760bafa64b2ea0334635ef06dbac8801e42f7279c", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
"parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm"},
"parse_trans": {:hex, :parse_trans, "3.2.0", "2adfa4daf80c14dc36f522cf190eb5c4ee3e28008fc6394397c16f62a26258c2", [:rebar3], [], "hexpm"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], [], "hexpm"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], [], "hexpm"},
}

View File

@ -9,9 +9,9 @@ defmodule CliTest do
assert parse_args(["--help"]) == :help assert parse_args(["--help"]) == :help
end end
test ":help returned by passing only a username" do test ":help returned by passing only a email" do
assert parse_args(["-u", "rob"]) == :help assert parse_args(["-e", "rob@netflux.io"]) == :help
assert parse_args(["--username", "rob"]) == :help assert parse_args(["--email", "rob@netflux.io"]) == :help
end end
test ":help returned by passing only a password" do test ":help returned by passing only a password" do
@ -20,10 +20,10 @@ defmodule CliTest do
end end
test "arguments returned by passing valid parameters" do test "arguments returned by passing valid parameters" do
assert parse_args(["-u", "rob", "-p", "hackme"]) == [username: "rob", password: "hackme"] assert parse_args(["-e", "rob@netflux.io", "-p", "hackme"]) == [email: "rob@netflux.io", password: "hackme"]
assert parse_args(["--username", "rob", "--password", "hackme"]) == [ assert parse_args(["--email", "rob@netflux.io", "--password", "hackme"]) == [
username: "rob", email: "rob@netflux.io",
password: "hackme" password: "hackme"
] ]
end end