Add Scraper module
This commit is contained in:
parent
e0330fdec0
commit
f24fa96a9e
|
@ -28,3 +28,6 @@ use Mix.Config
|
|||
# here (which is why it is important to import them last).
|
||||
#
|
||||
# import_config "#{Mix.env}.exs"
|
||||
|
||||
config :logger, :console, level: :debug, device: :standard_error
|
||||
config :hound, driver: "chrome_driver"
|
||||
|
|
|
@ -3,23 +3,28 @@ defmodule AmazonHistory.CLI do
|
|||
Handle the command-line parsing for the amazon_history tool
|
||||
"""
|
||||
|
||||
require Logger
|
||||
|
||||
def run(argv) do
|
||||
argv
|
||||
|> parse_args
|
||||
|> process
|
||||
|> scrape
|
||||
|> CSV.encode
|
||||
|> Enum.each(&IO.write/1)
|
||||
end
|
||||
|
||||
@doc """
Scrapes the order history for the options produced by `parse_args/1`.

Given `[email: _, password: _, start_year: _]` (in that order) it delegates to
`AmazonHistory.Scraper.fetch/3` and returns whatever that returns.

Any other argument prints the usage message to stderr and returns `[]`, so the
CSV pipeline in `run/1` receives an enumerable and simply writes nothing.
"""
def scrape(email: email, password: password, start_year: start_year) do
  # The password is deliberately masked in the log line.
  Logger.debug("Will scrape with email: #{email}, password: ****")
  AmazonHistory.Scraper.fetch(email, password, start_year)
end

def scrape(_) do
  IO.puts(:stderr, "Usage: --email <email> --password <password> --start-year <start year>")

  # IO.puts/2 returns :ok, which is not enumerable; hand back an empty row list
  # instead so that `CSV.encode |> Enum.each` in run/1 does not crash.
  []
end
|
||||
|
||||
@doc """
|
||||
Options:
|
||||
-u/--username: Amazon username
|
||||
-e/--email: Amazon email
|
||||
-p/--password: Amazon password
|
||||
"""
|
||||
def parse_args(argv) do
|
||||
|
@ -27,21 +32,27 @@ defmodule AmazonHistory.CLI do
|
|||
argv,
|
||||
switches: [
|
||||
help: :boolean,
|
||||
username: :string,
|
||||
password: :string
|
||||
email: :string,
|
||||
password: :string,
|
||||
start_year: :integer,
|
||||
],
|
||||
aliases: [
|
||||
h: :help,
|
||||
u: :username,
|
||||
p: :password
|
||||
e: :email,
|
||||
p: :password,
|
||||
y: :start_year,
|
||||
]
|
||||
)
|
||||
|> elem(0)
|
||||
|> args_to_internal_representation
|
||||
end
|
||||
|
||||
defp args_to_internal_representation(username: username, password: password) do
|
||||
[username: username, password: password]
|
||||
defp args_to_internal_representation(email: email, password: password, start_year: start_year) do
|
||||
[email: email, password: password, start_year: start_year]
|
||||
end
|
||||
|
||||
defp args_to_internal_representation(email: email, password: password) do
|
||||
[email: email, password: password, start_year: 2000]
|
||||
end
|
||||
|
||||
defp args_to_internal_representation(_) do
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
defmodule AmazonHistory.Scraper do
  @moduledoc """
  Scrapes the Amazon UK order history through a Hound browser session.

  `fetch/3` logs in, walks the order-history pages one year at a time and
  returns one row per purchased item, shaped as
  `[order_number, order_placed, name, total, url]`.
  """

  use Hound.Helpers

  require Logger

  @base_url "https://www.amazon.co.uk"
  @order_url "#{@base_url}/gp/your-account/order-history?opt=ab&digitalOrders=1&unifiedOrders=1&returnTo=&orderFilter=year-"
  @orders_per_page 10

  # The leading "." keeps the search relative to the order element it is run
  # within. A bare "//" is document-absolute in XPath and would match the
  # spans of the first order on the page for every order.
  @header_value_xpath ".//span[@class='a-color-secondary value']"

  @doc """
  Logs in with `email` / `password` and returns the item rows for every year
  from `start_year` up to the current UTC year (inclusive).

  Each row is `[order_number, order_placed, name, total, url]`. A `start_year`
  later than the current year yields `[]`.

  The Hound session opened here is always ended, even when login or scraping
  raises; the exception is then re-raised to the caller.
  """
  @spec fetch(String.t(), String.t(), integer()) :: [[String.t()]]
  def fetch(email, password, start_year) do
    Hound.start_session()

    try do
      login(email, password)

      Logger.info("Logged in successfully. Scraping with start_year: #{start_year}")

      start_year
      |> years_to_scrape(current_year())
      |> Enum.flat_map(&process_year(&1, 0))
    after
      Hound.end_session()
    end
  end

  # Resolved at call time: a module attribute would freeze the year at compile time.
  defp current_year, do: DateTime.utc_now().year

  # Ascending list of years; empty (rather than a descending range) when
  # `first` lies after `last`.
  defp years_to_scrape(first, last) when is_integer(first) and first <= last do
    Enum.to_list(first..last)
  end

  defp years_to_scrape(first, _last) when is_integer(first), do: []

  # Walks the sign-in form: account link, email, continue, password, submit.
  defp login(email, password) do
    navigate_to(@base_url)

    click_by_id("nav-link-yourAccount")
    fill_by_id("ap_email", email)
    click_by_id("continue")
    fill_by_id("ap_password", password)
    click_by_id("signInSubmit")
  end

  defp click_by_id(id), do: click(find_element(:id, id))

  defp fill_by_id(id, value), do: fill_field(find_element(:id, id), value)

  # Returns the rows of `year` from the page starting at `start_index` onwards,
  # following pages until one contains no orders.
  defp process_year(year, start_index) do
    year
    |> order_url(start_index)
    |> navigate_to()

    orders =
      :id
      |> find_element("ordersContainer")
      |> find_all_within_element(:class, "order")

    case orders do
      [] ->
        []

      order_elements ->
        # Rows are extracted before the next page is loaded, because the
        # element handles belong to the page currently shown.
        page_rows = Enum.flat_map(order_elements, &process_order_element/1)
        page_rows ++ process_year(year, start_index + @orders_per_page)
    end
  end

  defp order_url(year, start_index) do
    "#{@order_url}#{year}&startIndex=#{start_index}"
  end

  # Builds one row per product link inside a single order element.
  defp process_order_element(order_element) do
    order_placed = extract_order_placed(order_element)
    order_number = extract_order_number(order_element)
    total = extract_total(order_element)

    order_element
    |> find_all_within_element(:class, "a-link-normal")
    |> Enum.map(&to_row(&1, order_placed, order_number, total))
    |> Enum.filter(&valid_row?/1)
  end

  # First header value span of the order.
  defp extract_order_placed(order_element) do
    order_element
    |> find_within_element(:xpath, @header_value_xpath)
    |> inner_text()
  end

  # Third header value span of the order.
  defp extract_order_number(order_element) do
    order_element
    |> find_within_element(:xpath, "(#{@header_value_xpath})[3]")
    |> inner_text()
  end

  # The price element is optional; a single lookup attempt is made and a
  # missing price becomes "".
  defp extract_total(order_element) do
    case search_within_element(order_element, :class, "a-color-price", 1) do
      {:ok, price_element} -> inner_text(price_element)
      {:error, _reason} -> ""
    end
  end

  defp to_row(item_element, order_placed, order_number, total) do
    name = inner_text(item_element)
    url = attribute_value(item_element, :href)

    [order_number, order_placed, name, total, url]
  end

  # Keeps only links with visible text that point at a product page. Links
  # without a usable text or href are rejected instead of raising.
  defp valid_row?([_order_number, _order_placed, name, _total, url])
       when is_binary(name) and is_binary(url) do
    String.trim(name) != "" and String.contains?(url, "gp/product")
  end

  defp valid_row?(_row), do: false
end
|
2
mix.exs
2
mix.exs
|
@ -21,6 +21,8 @@ defmodule AmazonHistory.MixProject do
|
|||
# Run "mix help deps" to learn about dependencies.
|
||||
defp deps do
|
||||
[
|
||||
{:hound, "~> 1.0"},
|
||||
{:csv, "~> 2.0.0"},
|
||||
# {:dep_from_hexpm, "~> 0.3.0"},
|
||||
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
|
||||
]
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
%{
|
||||
"certifi": {:hex, :certifi, "2.3.1", "d0f424232390bf47d82da8478022301c561cf6445b5b5fb6a84d49a9e76d2639", [:rebar3], [{:parse_trans, "3.2.0", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"csv": {:hex, :csv, "2.0.0", "c66fea89ba7862b94901baf0871285e9b73cad89c5fdb57a6386d2adcf29593e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"hackney": {:hex, :hackney, "1.12.1", "8bf2d0e11e722e533903fe126e14d6e7e94d9b7983ced595b75f532e04b7fdc7", [:rebar3], [{:certifi, "2.3.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "5.1.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"hound": {:hex, :hound, "1.0.4", "31db3c013f0ed321b5eb4c573bf3fbc0b74e12fc8da134f9f616527bf0906431", [:mix], [{:hackney, "~> 1.5", [hex: :hackney, repo: "hexpm", optional: false]}, {:poison, ">= 1.4.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"idna": {:hex, :idna, "5.1.1", "cbc3b2fa1645113267cc59c760bafa64b2ea0334635ef06dbac8801e42f7279c", [:rebar3], [{:unicode_util_compat, "0.3.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
|
||||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
|
||||
"mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"},
|
||||
"parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm"},
|
||||
"parse_trans": {:hex, :parse_trans, "3.2.0", "2adfa4daf80c14dc36f522cf190eb5c4ee3e28008fc6394397c16f62a26258c2", [:rebar3], [], "hexpm"},
|
||||
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"},
|
||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], [], "hexpm"},
|
||||
"unicode_util_compat": {:hex, :unicode_util_compat, "0.3.1", "a1f612a7b512638634a603c8f401892afbf99b8ce93a45041f8aaca99cadb85e", [:rebar3], [], "hexpm"},
|
||||
}
|
|
@ -9,9 +9,9 @@ defmodule CliTest do
|
|||
assert parse_args(["--help"]) == :help
|
||||
end
|
||||
|
||||
test ":help returned by passing only a username" do
|
||||
assert parse_args(["-u", "rob"]) == :help
|
||||
assert parse_args(["--username", "rob"]) == :help
|
||||
test ":help returned by passing only a email" do
|
||||
assert parse_args(["-e", "rob@netflux.io"]) == :help
|
||||
assert parse_args(["--email", "rob@netflux.io"]) == :help
|
||||
end
|
||||
|
||||
test ":help returned by passing only a password" do
|
||||
|
@ -20,10 +20,10 @@ defmodule CliTest do
|
|||
end
|
||||
|
||||
test "arguments returned by passing valid parameters" do
  # When no start year is given, parse_args fills in the default of 2000.
  defaulted = [email: "rob@netflux.io", password: "hackme", start_year: 2000]

  assert parse_args(["-e", "rob@netflux.io", "-p", "hackme"]) == defaulted
  assert parse_args(["--email", "rob@netflux.io", "--password", "hackme"]) == defaulted

  # An explicit start year is parsed as an integer and passed through.
  assert parse_args(["-e", "rob@netflux.io", "-p", "hackme", "-y", "2015"]) == [
           email: "rob@netflux.io",
           password: "hackme",
           start_year: 2015
         ]
end
|
||||
|
|
Loading…
Reference in New Issue