| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- (in-package :cl-user)
- (defpackage chatikbot.scraping
- (:use :cl :chatikbot.common)
- (:export :*cookie-jar*
- :*current-dom*
- :*current-status*
- :*current-headers*
- :*current-uri*
- :do-request
- :do-form
- :have-cookies
- :with-request
- :with-chat-cookies
- :scrape-list))
- (in-package :chatikbot.scraping)
- (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
- (defvar *current-dom* nil "Currently active request dom")
- (defvar *current-status* nil "Currently active request status")
- (defvar *current-headers* nil "Currently active request headers")
- (defvar *current-uri* nil "Currently active request uri")
- (defun apply-context (template context)
- (sublis context template))
- (defun do-request (template &optional context processor)
- (let ((args (apply-context template context)))
- (apply (or processor 'xml-request) `(,@args :cookie-jar ,*cookie-jar*))))
- (defmacro with-request ((template &optional context processor) &Body body)
- `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
- (do-request ,template ,context ,processor)
- (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
- ,@body))
- (defmacro with-chat-cookies ((chat-id chat-cookies) &body body)
- (alexandria:with-gensyms (g-chat g-cookies)
- `(let* ((,g-chat ,chat-id)
- (,g-cookies ,chat-cookies)
- (*cookie-jar* (or (gethash ,g-chat ,g-cookies)
- (setf (gethash ,g-chat ,g-cookies)
- (cl-cookie:make-cookie-jar)))))
- ,@body)))
- (defun node-full-url (node attr &optional (uri *current-uri*))
- (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
- (defun do-validate (info &optional (dom *current-dom*))
- (cond
- ((null info) t)
- (t (not (zerop (length (clss:select info dom)))))))
- (defun do-form (form-info &optional context validate)
- (when form-info
- (destructuring-bind (request selector template) form-info
- (with-request (request context)
- (let ((form (clss:select selector *current-dom*)))
- (unless (zerop (length form))
- (let* ((form (elt form 0))
- (action (node-full-url form "action"))
- (method (or (keyify (plump:attribute form "method")) :get))
- (content (append (apply-context template context)
- (loop for input across (clss:select "input,select" form)
- for name = (plump:attribute input "name")
- for value = (plump:attribute input "value")
- when name
- unless (agets template name)
- collect (cons name value))))
- (action-request `(,action :method ,method :content ,content)))
- (with-request (action-request)
- (unless (do-validate validate)
- (error "validate error"))
- (values *current-dom* *current-status* *current-headers* *current-uri*)))))))))
- (defun have-cookies (request)
- (let ((uri (quri:uri (car request))))
- (cl-cookie:cookie-jar-host-cookies
- *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
- :securep (string= (quri:uri-scheme uri) "https"))))
- (defun extract-data (node &key attr (uri *current-uri*))
- (when node
- (cond
- ((eq :href attr) (node-full-url node "href" uri))
- ((stringp attr) (plump:attribute node attr))
- (t (unspacify (text-with-cdata node))))))
- (defun extract-info (parent template &optional (uri *current-uri*))
- (destructuring-bind (selector &key child attr multiple) template
- (labels ((extract (n)
- (when child
- (setf n (elt (plump:children n) child)))
- (extract-data n :attr attr :uri uri)))
- (let* ((nodes (clss:select selector parent))
- (data (map 'list #'extract nodes)))
- (if multiple data (car data))))))
- (defun extract-items-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
- (let ((nodes (clss:select items-selector dom)))
- (when limit
- (setf nodes (subseq nodes 0 (min limit (length nodes)))))
- (loop for node across nodes
- collect (loop for (key . template) in info-templates
- collect (cons key (extract-info node template uri))))))
- (defun scrape-list (info &optional context)
- (let* ((scrape (agets info :scrape))
- (request (agets scrape :request))
- (processor (or (agets scrape :processor) #'xml-request))
- (init (agets info :init))
- (validate (agets info :validate))
- (validate-init (or (agets info :validate-init) validate)))
- (labels ((return-info ()
- (extract-items-info (agets scrape :items)
- (agets scrape :info))))
- (unless (have-cookies request)
- (do-form init context validate-init))
- (with-request (request context processor)
- (if (do-validate validate)
- (return-info)
- (progn
- (do-form init context validate-init)
- (with-request (request context)
- (return-info))))))))
|