ソースを参照

[core] scraping

Innocenty Enikeew 7 年 前
コミット
9b043ee577
4 ファイル変更141 行追加0 行削除
  1. 1 0
      chatikbot.asd
  2. 1 0
      plugins/shopper.lisp
  3. 56 0
      plugins/tescort.lisp
  4. 83 0
      scraping.lisp

+ 1 - 0
chatikbot.asd

@@ -38,4 +38,5 @@
                (:file "chat-cron")
                (:file "server")
                (:file "common")
+               (:file "scraping")
                (:file "chatikbot")))

+ 1 - 0
plugins/shopper.lisp

@@ -145,6 +145,7 @@
                                   (agets result :link)))))))
 
 (defun handle-search (chat-id query)
+  (telegram-send-chat-action chat-id "typing")
   (let ((results (merge-search-results
                   (search-shops query +shops+ 3))))
     (bot-send-message chat-id (if results (format-merged results) "Ничего не нашлось")

+ 56 - 0
plugins/tescort.lisp

@@ -0,0 +1,56 @@
+(in-package :cl-user)
+(defpackage chatikbot.plugins.tescort
+  (:use :cl :chatikbot.common :chatikbot.scraping))
+(in-package :chatikbot.plugins.tescort)
+
+(defparameter +login-form+
+  '(("http://www.tescort.com/private/signin")
+    ".signin-box form"
+    (("username" . login)
+     ("password" . pass))))
+
+(defparameter +blacklist-search+
+  '(("http://www.tescort.com/panel/client-blacklist" :parameters (("client_criterias" . q)
+                                                                  ("search" . "search")))
+    ".black_list_table tbody tr"
+    ((:date . ("td:nth-of-type(1)"))
+     (:name . ("td:nth-of-type(2)"))
+     (:city . ("td:nth-of-type(3)"))
+     (:phone . ("td:nth-of-type(4)"))
+     (:comment . ("td:nth-of-type(5)")))))
+
+(defvar *login*)
+(defvar *pass*)
+(defun login ()
+  (let ((dom (do-form +login-form+ `((login . ,*login*) (pass . ,*pass*)))))
+    (when (zerop (length (clss:select "#header .sitemenu-logged" dom)))
+      (error "bad password"))
+    dom))
+
+(defvar *tescort-cookie-jar* (cl-cookie:make-cookie-jar))
+(defun search-blacklist (query)
+  (let ((*cookie-jar* *tescort-cookie-jar*))
+    (unless (have-cookies (car +blacklist-search+))
+      (login))
+    (scrape-list +blacklist-search+ `((q . ,query)))))
+
+(defun format-blacklist (blacklist)
+  (with-output-to-string (s)
+    (loop for item in blacklist
+       do (format s "~A, ~A: ~A *~A*~%~A~%~%"
+                  (agets item :date) (agets item :city)
+                  (agets item :name) (agets item :phone)
+                  (agets item :comment)))))
+
+(defun handle-blacklist (chat-id query)
+  (telegram-send-chat-action chat-id "typing")
+  (let ((results (search-blacklist query)))
+    (bot-send-message chat-id (if results (format-blacklist results) "Not found")
+                      :parse-mode "markdown" :disable-web-preview "true")))
+
+(def-message-cmd-handler handler-cmd-blacklist (:blacklist)
+  (cond
+    ((null args) (bot-send-message chat-id "Enter query")
+     (on-next-message chat-id
+       (handle-blacklist chat-id text)))
+    (:otherwise (handle-blacklist chat-id (spaced args)))))

+ 83 - 0
scraping.lisp

@@ -0,0 +1,83 @@
+(in-package :cl-user)
+(defpackage chatikbot.scraping
+  (:use :cl :chatikbot.common)
+  (:export :*cookie-jar*
+           :*current-dom*
+           :*current-status*
+           :*current-headers*
+           :*current-uri*
+           :do-request
+           :do-form
+           :have-cookies
+           :with-request
+           :scrape-list))
+(in-package :chatikbot.scraping)
+
+(defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
+(defvar *current-dom* nil "Currently active request dom")
+(defvar *current-status* nil "Currently active request status")
+(defvar *current-headers* nil "Currently active request headers")
+(defvar *current-uri* nil "Currently active request uri")
+
+(defun apply-context (template context)
+  (sublis context template))
+
+(defun do-request (template &optional context)
+  (let ((args (apply-context template context)))
+    (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
+
+(defmacro with-request ((template &optional context) &body body)
+  `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
+       (do-request ,template ,context)
+     (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
+     ,@body))
+
+(defun node-full-url (node attr &optional (uri *current-uri*))
+  (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
+
+(defun do-form (form-info context)
+  (destructuring-bind (request selector template) form-info
+    (with-request (request context)
+      (let ((form (clss:select selector *current-dom*)))
+        (unless (zerop (length form))
+          (let* ((form (elt form 0))
+                 (action (node-full-url form "action"))
+                 (method (or (keyify (plump:attribute form "method")) :get))
+                 (content (append (apply-context template context)
+                                  (loop for input across (clss:select "input,select" form)
+                                     for name = (plump:attribute input "name")
+                                     for value = (plump:attribute input "value")
+                                     when name
+                                     unless (agets template name)
+                                     collect (cons name value)))))
+            (do-request `(,action :method ,method :content ,content))))))))
+
+(defun have-cookies (request)
+  (let ((uri (quri:uri (car request))))
+    (cl-cookie:cookie-jar-host-cookies
+     *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
+     :securep (string= (quri:uri-scheme uri) "https"))))
+
+(defun extract-from-node (node template &optional (uri *current-uri*))
+  (destructuring-bind (selector &optional post) template
+    (let* ((nodes (clss:select selector node))
+           (node (and (not (zerop (length nodes)))
+                      (elt nodes 0))))
+      (when node
+        (cond
+          ((eq :href post) (node-full-url node "href" uri))
+          ((stringp post) (plump:attribute node post))
+          (t (unspacify (text-with-cdata node))))))))
+
+(defun extract-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
+  (let ((nodes (clss:select items-selector dom)))
+    (when limit
+      (setf nodes (subseq nodes 0 (min limit (length nodes)))))
+    (loop for node across nodes
+       collect (loop for (key . template) in info-templates
+                  collect (cons key (extract-from-node node template uri))))))
+
+(defun scrape-list (info &optional context)
+  (destructuring-bind (request item-selector item-info) info
+    (with-request (request context)
+      (extract-info item-selector item-info))))