1
0

scraping.lisp 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. (in-package :cl-user)
  2. (defpackage chatikbot.scraping
  3. (:use :cl :chatikbot.common)
  4. (:export :*cookie-jar*
  5. :*current-dom*
  6. :*current-status*
  7. :*current-headers*
  8. :*current-uri*
  9. :do-request
  10. :do-form
  11. :have-cookies
  12. :with-request
  13. :scrape-list))
  14. (in-package :chatikbot.scraping)
  15. (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
  16. (defvar *current-dom* nil "Currently active request dom")
  17. (defvar *current-status* nil "Currently active request status")
  18. (defvar *current-headers* nil "Currently active request headers")
  19. (defvar *current-uri* nil "Currently active request uri")
  20. (defun apply-context (template context)
  21. (sublis context template))
  22. (defun do-request (template &optional context)
  23. (let ((args (apply-context template context)))
  24. (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
  25. (defmacro with-request ((template &optional context) &body body)
  26. `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
  27. (do-request ,template ,context)
  28. (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
  29. ,@body))
  30. (defun node-full-url (node attr &optional (uri *current-uri*))
  31. (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
  32. (defun do-form (form-info context)
  33. (destructuring-bind (request selector template) form-info
  34. (with-request (request context)
  35. (let ((form (clss:select selector *current-dom*)))
  36. (unless (zerop (length form))
  37. (let* ((form (elt form 0))
  38. (action (node-full-url form "action"))
  39. (method (or (keyify (plump:attribute form "method")) :get))
  40. (content (append (apply-context template context)
  41. (loop for input across (clss:select "input,select" form)
  42. for name = (plump:attribute input "name")
  43. for value = (plump:attribute input "value")
  44. when name
  45. unless (agets template name)
  46. collect (cons name value)))))
  47. (do-request `(,action :method ,method :content ,content))))))))
  48. (defun have-cookies (request)
  49. (let ((uri (quri:uri (car request))))
  50. (cl-cookie:cookie-jar-host-cookies
  51. *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
  52. :securep (string= (quri:uri-scheme uri) "https"))))
  53. (defun extract-from-node (node template &optional (uri *current-uri*))
  54. (destructuring-bind (selector &optional post) template
  55. (let* ((nodes (clss:select selector node))
  56. (node (and (not (zerop (length nodes)))
  57. (elt nodes 0))))
  58. (when node
  59. (cond
  60. ((eq :href post) (node-full-url node "href" uri))
  61. ((stringp post) (plump:attribute node post))
  62. (t (unspacify (text-with-cdata node))))))))
  63. (defun extract-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
  64. (let ((nodes (clss:select items-selector dom)))
  65. (when limit
  66. (setf nodes (subseq nodes 0 (min limit (length nodes)))))
  67. (loop for node across nodes
  68. collect (loop for (key . template) in info-templates
  69. collect (cons key (extract-from-node node template uri))))))
  70. (defun scrape-list (info &optional context)
  71. (destructuring-bind (request item-selector item-info) info
  72. (with-request (request context)
  73. (extract-info item-selector item-info))))