Setup feeds, request, and filter them
This commit is contained in:
parent
657975324a
commit
70e78bcf42
@ -4,6 +4,8 @@ require_once __DIR__ . '/../vendor/autoload.php';
|
|||||||
use Doctrine\ORM\EntityManager;
|
use Doctrine\ORM\EntityManager;
|
||||||
use Doctrine\ORM\Tools\Console\ConsoleRunner;
|
use Doctrine\ORM\Tools\Console\ConsoleRunner;
|
||||||
use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider;
|
use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider;
|
||||||
|
use Lewisdale\App\Models\Repositories\FeedRepository;
|
||||||
|
use Lewisdale\App\Tools\Console\TestFeed;
|
||||||
|
|
||||||
$dotenv = Dotenv\Dotenv::createImmutable([__DIR__, __DIR__ . "/.."]);
|
$dotenv = Dotenv\Dotenv::createImmutable([__DIR__, __DIR__ . "/.."]);
|
||||||
$dotenv->load();
|
$dotenv->load();
|
||||||
@ -12,6 +14,11 @@ $dotenv->load();
|
|||||||
require_once __DIR__ . '/../src/dependencies.php';
|
require_once __DIR__ . '/../src/dependencies.php';
|
||||||
global $container;
|
global $container;
|
||||||
|
|
||||||
|
$commands = [
|
||||||
|
$container->get(TestFeed::class),
|
||||||
|
];
|
||||||
|
|
||||||
ConsoleRunner::run(
|
ConsoleRunner::run(
|
||||||
new SingleManagerProvider($container->get(EntityManager::class))
|
new SingleManagerProvider($container->get(EntityManager::class)),
|
||||||
|
$commands
|
||||||
);
|
);
|
@ -20,7 +20,10 @@
|
|||||||
"symfony/dom-crawler": "^6.3",
|
"symfony/dom-crawler": "^6.3",
|
||||||
"symfony/css-selector": "^6.3",
|
"symfony/css-selector": "^6.3",
|
||||||
"symfony/http-client": "^6.3",
|
"symfony/http-client": "^6.3",
|
||||||
"league/uri": "^6.8"
|
"league/uri": "^6.8",
|
||||||
|
"ext-simplexml": "*",
|
||||||
|
"ext-curl": "*",
|
||||||
|
"ext-dom": "*"
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"phpunit/phpunit": "^10.0"
|
"phpunit/phpunit": "^10.0"
|
||||||
|
@ -26,7 +26,7 @@ class FeedController
|
|||||||
return $this->view->render($response, 'index.twig.html', ['feeds' => $feeds]);
|
return $this->view->render($response, 'index.twig.html', ['feeds' => $feeds]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function create(ServerRequestInterface $request, ResponseInterface $response)
|
public function create(ServerRequestInterface $request, ResponseInterface $response): ResponseInterface
|
||||||
{
|
{
|
||||||
$this->logger->info("FeedController::create() called");
|
$this->logger->info("FeedController::create() called");
|
||||||
$body = $request->getParsedBody();
|
$body = $request->getParsedBody();
|
||||||
|
@ -4,9 +4,16 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
namespace Lewisdale\App\Models\Data;
|
namespace Lewisdale\App\Models\Data;
|
||||||
|
|
||||||
|
use Doctrine\Common\Collections\Collection;
|
||||||
|
use Doctrine\DBAL\Types\Types;
|
||||||
use Doctrine\ORM\Mapping as ORM;
|
use Doctrine\ORM\Mapping as ORM;
|
||||||
|
use DOMXPath;
|
||||||
|
use Lewisdale\App\Models\Types\XMLElement;
|
||||||
|
use Lewisdale\App\Requests\Robots;
|
||||||
use Ramsey\Uuid\Doctrine\UuidGenerator;
|
use Ramsey\Uuid\Doctrine\UuidGenerator;
|
||||||
use Ramsey\Uuid\UuidInterface;
|
use Ramsey\Uuid\UuidInterface;
|
||||||
|
use SimpleXMLElement;
|
||||||
|
use function Symfony\Component\String\s;
|
||||||
|
|
||||||
#[ORM\Entity]
|
#[ORM\Entity]
|
||||||
#[ORM\Table(name: 'feeds')]
|
#[ORM\Table(name: 'feeds')]
|
||||||
@ -24,4 +31,58 @@ class Feed
|
|||||||
|
|
||||||
#[ORM\Column(type: 'string')]
|
#[ORM\Column(type: 'string')]
|
||||||
public string $title;
|
public string $title;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @var Collection<FeedFilter>
|
||||||
|
*/
|
||||||
|
#[ORM\OneToMany(mappedBy: 'feed', targetEntity: FeedFilter::class, cascade: ['persist', 'remove'])]
|
||||||
|
public Collection $feedFilters;
|
||||||
|
|
||||||
|
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
|
||||||
|
public SimpleXMLElement|null $remoteFeed;
|
||||||
|
|
||||||
|
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
|
||||||
|
public SimpleXMLElement|null $filteredFeed;
|
||||||
|
|
||||||
|
/**
 * Download the feed at $this->url and store it in $this->remoteFeed.
 *
 * The download is only attempted when Robots::allowed() approves the URL.
 *
 * @throws \Exception when robots.txt disallows the URL, or when the
 *                    response cannot be loaded as XML.
 */
public function fetch(): void
{
    if (!Robots::allowed($this->url)) {
        throw new \Exception("Robots.txt disallows fetching this feed");
    }

    // simplexml_load_file() reports failure by returning false, which the
    // SimpleXMLElement|null property cannot hold; fail with a clear message
    // instead of an opaque TypeError on assignment.
    $loaded = simplexml_load_file($this->url);
    if ($loaded === false) {
        throw new \Exception("Unable to load feed XML from {$this->url}");
    }

    $this->remoteFeed = $loaded;
}
|
||||||
|
|
||||||
|
/**
 * Run a single feed item through every filter attached to this feed.
 *
 * @param \DOMNode $item the node handed to each filter (converted to
 *                       SimpleXML for FeedFilter::execute()).
 * @return bool false as soon as one filter's execute() returns a falsy
 *              result, true when no filter rejects the item.
 */
private function test_item(\DOMNode $item): bool
{
    foreach ($this->feedFilters as $feedFilter) {
        $accepted = $feedFilter->execute(simplexml_import_dom($item));

        if (!$accepted) {
            return false;
        }
    }

    return true;
}
|
||||||
|
|
||||||
|
/**
 * Build $this->filteredFeed from $this->remoteFeed by removing every
 * <item> element selected by the XPath predicates of the feed's filters.
 *
 * The remote feed is fetched first when it has not been loaded yet. The
 * number of removed items is echoed to standard output.
 *
 * @throws \Exception when fetching fails (see fetch()) or when the combined
 *                    filter expression is rejected by the XPath engine.
 */
public function filter() : void
{
    // isset() covers both a null value and a not-yet-initialised typed property.
    if (!isset($this->remoteFeed)) {
        $this->fetch();
    }

    // Work on a deep copy: dom_import_simplexml() exposes the same underlying
    // document as remoteFeed, so removing nodes from it directly would also
    // strip them from the unfiltered feed.
    $doc = dom_import_simplexml($this->remoteFeed)->ownerDocument->cloneNode(true);

    // Each predicate is parenthesised and combined with "or" so that boolean
    // predicates (EXACT) and node-set predicates (INCLUDE/EXCLUDE) can be mixed;
    // the "|" union operator only accepts node-sets.
    $predicates = array_map(
        static fn($filter) => '(' . $filter->to_xpath() . ')',
        $this->feedFilters->toArray()
    );

    // With no filters there is nothing to remove, and "//item[]" is not valid XPath.
    if ($predicates !== []) {
        $xpath = new DOMXPath($doc);
        $xpath_query = '//item[' . implode(' or ', $predicates) . ']';

        $to_remove = $xpath->query($xpath_query);
        if ($to_remove === false) {
            throw new \Exception("Invalid filter expression: {$xpath_query}");
        }

        echo "Removing " . $to_remove->length . " items\n";
        foreach ($to_remove as $item) {
            $item->parentNode->removeChild($item);
        }
    }

    $this->filteredFeed = simplexml_import_dom($doc->documentElement);
}
|
||||||
}
|
}
|
@ -13,6 +13,24 @@ use Ramsey\Uuid\UuidInterface;
|
|||||||
#[ORM\HasLifecycleCallbacks]
|
#[ORM\HasLifecycleCallbacks]
|
||||||
class FeedFilter
|
class FeedFilter
|
||||||
{
|
{
|
||||||
|
/**
 * Create a filter bound to a feed.
 *
 * @param FilterTarget       $target stored in $this->target
 * @param FilterType         $filter stored in $this->filter
 * @param string             $value  stored in $this->value
 * @param Feed               $feed   the feed this filter belongs to
 * @param UuidInterface|null $id     explicit identifier; when null the id
 *                                   property is left unset by the constructor
 */
public function __construct(
    FilterTarget $target,
    FilterType $filter,
    string $value,
    Feed $feed,
    // Explicitly nullable: relying on "= null" to imply nullability is deprecated.
    ?UuidInterface $id = null,
)
{
    if ($id !== null) {
        $this->id = $id;
    }

    $this->target = $target;
    $this->filter = $filter;
    $this->value = $value;
    $this->feed = $feed;
}
|
||||||
|
|
||||||
#[ORM\Id]
|
#[ORM\Id]
|
||||||
#[ORM\Column(type: "uuid", unique: true)]
|
#[ORM\Column(type: "uuid", unique: true)]
|
||||||
#[ORM\GeneratedValue(strategy: "CUSTOM")]
|
#[ORM\GeneratedValue(strategy: "CUSTOM")]
|
||||||
@ -33,7 +51,30 @@ class FeedFilter
|
|||||||
#[ORM\Column(type: 'string')]
|
#[ORM\Column(type: 'string')]
|
||||||
public string $value;
|
public string $value;
|
||||||
|
|
||||||
#[ORM\ManyToOne(targetEntity: Feed::class)]
|
#[ORM\ManyToOne(targetEntity: Feed::class, cascade: ['persist', 'remove'], inversedBy: 'feedFilters')]
|
||||||
#[ORM\JoinColumn(name: 'feed_id', referencedColumnName: 'id')]
|
#[ORM\JoinColumn(name: 'feed_id', referencedColumnName: 'id')]
|
||||||
public Feed $feed;
|
public Feed $feed;
|
||||||
|
|
||||||
|
/**
 * Build the XPath predicate for this filter, relative to a feed item.
 *
 * The predicate is built from the target's backing value (used as the child
 * element name) and the filter value, which is emitted as a safely quoted
 * XPath string literal.
 *
 * @return string an XPath 1.0 predicate expression
 * @throws \LogicException for FilterType::REGEX, which has no XPath 1.0 form
 */
public function to_xpath(): string
{
    $node = $this->target->value;
    $literal = self::xpath_literal($this->value);

    return match ($this->filter) {
        // NOTE(review): this selects items whose target EQUALS the value, while
        // execute() returns true for EXACT on equality; INCLUDE/EXCLUDE are
        // negated relative to execute(). Confirm which polarity EXACT should have.
        FilterType::EXACT => "{$node}={$literal}",
        FilterType::INCLUDE => "{$node}[not(contains(text(), {$literal}))]",
        FilterType::EXCLUDE => "{$node}[contains(text(), {$literal})]",
        // Previously unhandled, which surfaced as an UnhandledMatchError.
        FilterType::REGEX => throw new \LogicException(
            'REGEX filters cannot be expressed as an XPath 1.0 predicate'
        ),
    };
}

/**
 * Quote a string as an XPath 1.0 literal.
 *
 * XPath 1.0 has no escape sequences, so a value containing both quote
 * characters is assembled with concat().
 */
private static function xpath_literal(string $value): string
{
    if (!str_contains($value, "'")) {
        return "'{$value}'";
    }

    if (!str_contains($value, '"')) {
        return "\"{$value}\"";
    }

    // Split on single quotes and re-join them as double-quoted "'" pieces.
    return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
}
|
||||||
|
|
||||||
|
/**
 * Evaluate this filter against a single feed item.
 *
 * The text of the targeted child element (title, description or link) is
 * compared with the filter value:
 *  - INCLUDE: true when the text contains the value (case-insensitive)
 *  - EXACT:   true when the text is identical to the value
 *  - REGEX:   true when the value, used as a PCRE pattern, matches the text
 *  - EXCLUDE: true when the text does not contain the value (case-sensitive)
 *
 * @param \SimpleXMLElement $item the feed item to inspect
 */
public function execute(\SimpleXMLElement $item): bool
{
    $subject = (string) match ($this->target) {
        FilterTarget::TITLE => $item->title,
        FilterTarget::DESCRIPTION => $item->description,
        FilterTarget::LINK => $item->link,
    };

    return match ($this->filter) {
        FilterType::INCLUDE => str_contains(strtolower($subject), strtolower($this->value)),
        FilterType::EXACT => $subject === $this->value,
        // preg_match() yields 1, 0 or false; compare explicitly so the method
        // always returns a real bool (an int return violates ": bool" under
        // strict_types) and an invalid pattern counts as "no match".
        FilterType::REGEX => preg_match($this->value, $subject) === 1,
        FilterType::EXCLUDE => !str_contains($subject, $this->value),
    };
}
|
||||||
}
|
}
|
40
src/Models/Types/XMLElement.php
Normal file
40
src/Models/Types/XMLElement.php
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
<?php
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
|
||||||
|
namespace Lewisdale\App\Models\Types;
|
||||||
|
|
||||||
|
use Doctrine\DBAL\Platforms\AbstractPlatform;
|
||||||
|
use Doctrine\DBAL\Types\Type;
|
||||||
|
|
||||||
|
/**
 * Doctrine DBAL column type that stores a SimpleXMLElement as XML text.
 */
class XMLElement extends Type
{
    /** Name under which this type is registered and referenced in column mappings. */
    public const NAME = 'xml_element';

    /**
     * The column is declared as TEXT regardless of platform.
     */
    public function getSQLDeclaration(array $column, AbstractPlatform $platform): string
    {
        return 'TEXT';
    }

    /**
     * Parse the stored XML text back into a SimpleXMLElement.
     *
     * @param mixed $value the raw database value
     * @return \SimpleXMLElement|null null when the column is null
     * @throws \Doctrine\DBAL\Types\ConversionException when the stored text is not parseable XML
     */
    public function convertToPHPValue($value, AbstractPlatform $platform): ?\SimpleXMLElement
    {
        if ($value === null) {
            return null;
        }

        // simplexml_load_string() returns false on malformed input; surface that
        // as a conversion failure instead of handing false to the entity.
        $element = simplexml_load_string($value);
        if ($element === false) {
            throw \Doctrine\DBAL\Types\ConversionException::conversionFailed($value, self::NAME);
        }

        return $element;
    }

    /**
     * Serialise a SimpleXMLElement to its XML text; any other value
     * (including null) is passed through unchanged.
     *
     * @param mixed $value the PHP-side value
     */
    public function convertToDatabaseValue($value, AbstractPlatform $platform)
    {
        if ($value instanceof \SimpleXMLElement) {
            return $value->asXML();
        }

        return $value;
    }

    public function getName(): string
    {
        return self::NAME;
    }
}
|
89
src/Requests/Robots.php
Normal file
89
src/Requests/Robots.php
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
<?php
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
|
||||||
|
namespace Lewisdale\App\Requests;
|
||||||
|
|
||||||
|
class Robots
{
    // Original PHP code by Chirp Internet: www.chirpinternet.eu
    // Adapted to include 404 and Allow directive checking by Eric at LinkUp.com
    // Please acknowledge use of this code by including this header.

    /**
     * Decide whether the robots.txt of the URL's host permits fetching the URL.
     *
     * Rules from every group whose User-agent line matches "*" or $useragent
     * are collected. The longest rule that prefixes the URL path wins, and
     * Allow beats Disallow when both have the same length. Rule values are
     * compared literally; "*" and "$" wildcards are not interpreted.
     *
     * @param string      $url       the URL that is about to be requested
     * @param string|null $useragent agent name to match in addition to "*"
     * @return bool true when fetching is permitted, including when no
     *              robots.txt could be retrieved
     */
    public static function allowed(string $url, string | null $useragent = "Baleen"): bool
    {
        $parsed = parse_url($url);

        // Without a host there is no robots.txt to consult.
        if (!is_array($parsed) || empty($parsed['host'])) {
            return true;
        }

        // Ask over the same scheme as the feed; anything that is not https
        // falls back to plain http.
        $scheme = strtolower($parsed['scheme'] ?? 'http') === 'https' ? 'https' : 'http';
        $authority = $parsed['host'] . (isset($parsed['port']) ? ':' . $parsed['port'] : '');

        $lines = self::fetchRobotsTxt("{$scheme}://{$authority}/robots.txt");

        // if there isn't a robots, then we're allowed in
        if ($lines === []) {
            return true;
        }

        // A URL without a path refers to the site root.
        $path = $parsed['path'] ?? '/';

        $isAllowed = true;
        $currentStrength = 0;

        foreach (self::parseRules($lines, $useragent) as $rule) {
            if (!str_starts_with($path, $rule['prefix'])) {
                continue;
            }

            // prefer longer (more specific) rules and Allow trumps Disallow if rules same length
            $strength = strlen($rule['prefix']);
            if ($currentStrength < $strength) {
                $currentStrength = $strength;
                $isAllowed = ($rule['type'] === 'allow');
            } elseif ($currentStrength === $strength && $rule['type'] === 'allow') {
                $isAllowed = true;
            }
        }

        return $isAllowed;
    }

    /**
     * Retrieve robots.txt as a list of lines; an empty list means "not available".
     *
     * @return string[]
     */
    private static function fetchRobotsTxt(string $robotsUrl): array
    {
        if (function_exists('curl_init')) {
            $handle = curl_init($robotsUrl);
            if ($handle === false) {
                return [];
            }

            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            // Follow redirects (e.g. http -> https, apex -> www) so that a
            // redirecting site is not mistaken for one without a robots.txt.
            curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);

            $response = curl_exec($handle);
            $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

            // only pay attention to the file if the server says it exists
            if ($httpCode !== 200 || !is_string($response)) {
                return [];
            }

            return explode("\n", $response);
        }

        // Best-effort fallback when the curl extension is unavailable; a failed
        // read is deliberately treated as "no robots.txt".
        $lines = @file($robotsUrl);

        return $lines === false ? [] : $lines;
    }

    /**
     * Extract the Allow/Disallow rules that apply to "*" or the given agent.
     *
     * @param string[]    $lines     raw robots.txt lines
     * @param string|null $useragent agent name to match in addition to "*"
     * @return array<int, array{type: string, prefix: string}>
     */
    private static function parseRules(array $lines, ?string $useragent): array
    {
        $agents = [preg_quote('*', '/')];
        if ($useragent) {
            $agents[] = preg_quote($useragent, '/');
        }
        $agentPattern = '/(' . implode('|', $agents) . ')/i';

        $rules = [];
        $ruleApplies = false;
        $inAgentRun = false;

        foreach ($lines as $line) {
            // drop trailing comments, then skip blank and malformed lines
            $line = trim(explode('#', $line, 2)[0]);
            if ($line === '' || !str_contains($line, ':')) {
                continue;
            }

            [$field, $value] = explode(':', $line, 2);
            $field = strtolower(trim($field));
            $value = trim($value);

            if ($field === 'user-agent') {
                $matches = preg_match($agentPattern, $value) === 1;
                // Consecutive User-agent lines share the rules that follow them,
                // so a match on any line of the run applies to the whole group.
                $ruleApplies = $inAgentRun ? ($ruleApplies || $matches) : $matches;
                $inAgentRun = true;
                continue;
            }

            $inAgentRun = false;

            // Other directives (Sitemap, Crawl-delay, ...) are not access rules.
            if ($ruleApplies && ($field === 'allow' || $field === 'disallow')) {
                $rules[] = [
                    'type' => $field,
                    'prefix' => $value,
                ];
            }
        }

        return $rules;
    }
}
|
79
src/Tools/Console/TestFeed.php
Normal file
79
src/Tools/Console/TestFeed.php
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
<?php
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
|
||||||
|
namespace Lewisdale\App\Tools\Console;
|
||||||
|
|
||||||
|
use Symfony\Component\Console\{Attribute\AsCommand,
|
||||||
|
Command\Command,
|
||||||
|
Input\InputArgument,
|
||||||
|
Input\InputInterface,
|
||||||
|
Output\OutputInterface};
|
||||||
|
use Doctrine\Common\Collections\ArrayCollection;
|
||||||
|
use Doctrine\ORM\EntityManager;
|
||||||
|
use Lewisdale\App\Models\Data\Feed;
|
||||||
|
use Lewisdale\App\Models\Data\FeedFilter;
|
||||||
|
use Lewisdale\App\Models\Data\FilterTarget;
|
||||||
|
use Lewisdale\App\Models\Data\FilterType;
|
||||||
|
use Lewisdale\App\Models\Repositories\FeedRepository;
|
||||||
|
use function DI\add;
|
||||||
|
|
||||||
|
#[AsCommand(name: "test:feed")]
|
||||||
|
/**
 * Console command that attaches an "S2" title-exclusion filter to an existing
 * feed, runs the feed's filter, persists the result and writes the filtered
 * XML to feed_filtered.xml.
 */
class TestFeed extends Command
{
    public function __construct(private readonly EntityManager $em,
                                private readonly FeedRepository $feedRepository)
    {
        parent::__construct();
    }

    /**
     * Declare the two required arguments: the feed URL and the feed title.
     */
    protected function configure(): void
    {
        $this
            ->addArgument("url", InputArgument::REQUIRED, "The URL of the feed to test")
            ->addArgument("title", InputArgument::REQUIRED, "The title of the feed to test");
    }

    /**
     * Look up the feed by URL and title, add the filter, run it and save the output.
     *
     * @return int Command::SUCCESS when the filtered feed was written,
     *             Command::FAILURE when the feed does not exist or the
     *             file could not be written
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $output->writeln([
            "Testing Feed",
            "============",
        ]);

        $url = $input->getArgument("url");
        $title = $input->getArgument("title");

//        $output->writeln("Setting up a new feed for: $url");
//
//        $feed = new Feed();
//        $feed->url = $url;
//        $feed->title = $title;
//        $feed->fetch();
//        $feed->feedFilters = new ArrayCollection();
//        $feed->feedFilters->add( new FeedFilter(FilterTarget::TITLE, FilterType::INCLUDE, "[No Ads]", $feed));
//
//        $this->em->persist($feed);
//        $this->em->flush();
//
//        $cnt = $this->feedRepository->count([]);
//        $output->writeln("Feed count: $cnt");

        $criteria = ['url' => $url, 'title' => $title];

        $saved_feed = $this->feedRepository->findOneBy($criteria);
        if ($saved_feed === null) {
            // Nothing to test against; report it instead of dereferencing null.
            $output->writeln("No feed found for url '$url' with title '$title'");

            return Command::FAILURE;
        }
        $output->writeln("Feed title: " . $saved_feed->title);

        $s2_filter = new FeedFilter(FilterTarget::TITLE, FilterType::EXCLUDE, "S2", $saved_feed);
        $this->em->persist($s2_filter);
        $this->em->flush();

        // Re-query as before; keep the instance already loaded if nothing comes back.
        $saved_feed = $this->feedRepository->findOneBy($criteria) ?? $saved_feed;
        $saved_feed->filter();

        $this->em->persist($saved_feed);
        $this->em->flush();

        // asXML() with a filename writes the file and returns a bool, so report
        // the outcome rather than concatenating that bool into the message.
        $written = $saved_feed->filteredFeed?->asXML('feed_filtered.xml') ?? false;
        if (!$written) {
            $output->writeln("Failed to write filtered feed to feed_filtered.xml");

            return Command::FAILURE;
        }

        $output->writeln("Filtered feed written to feed_filtered.xml");

        return Command::SUCCESS;
    }
}
|
@ -4,6 +4,8 @@ use Lewisdale\App\Controllers\FeedController;
|
|||||||
use Lewisdale\App\Controllers\SampleController;
|
use Lewisdale\App\Controllers\SampleController;
|
||||||
use Slim\Views\TwigMiddleware;
|
use Slim\Views\TwigMiddleware;
|
||||||
|
|
||||||
|
ini_set('user_agent', 'Baleen/1.0 (https://lewisdale.dev)');
|
||||||
|
|
||||||
require_once __DIR__ . "/dependencies.php";
|
require_once __DIR__ . "/dependencies.php";
|
||||||
|
|
||||||
global $container;
|
global $container;
|
||||||
|
@ -6,6 +6,7 @@ use Doctrine\DBAL\Types\Type;
|
|||||||
use Doctrine\ORM\EntityManager;
|
use Doctrine\ORM\EntityManager;
|
||||||
use Doctrine\ORM\ORMSetup;
|
use Doctrine\ORM\ORMSetup;
|
||||||
use Lewisdale\App\Logging\FileLogger;
|
use Lewisdale\App\Logging\FileLogger;
|
||||||
|
use Lewisdale\App\Models\Types\XMLElement;
|
||||||
use Lewisdale\App\TwigExtensions\CsrfExtension;
|
use Lewisdale\App\TwigExtensions\CsrfExtension;
|
||||||
use Psr\Log\LoggerInterface;
|
use Psr\Log\LoggerInterface;
|
||||||
use Ramsey\Uuid\Doctrine\UuidType;
|
use Ramsey\Uuid\Doctrine\UuidType;
|
||||||
@ -40,6 +41,7 @@ $container->set(LoggerInterface::class, $container->get(FileLogger::class));
|
|||||||
|
|
||||||
$container->set(EntityManager::class, static function() {
|
$container->set(EntityManager::class, static function() {
|
||||||
Type::addType('uuid', UuidType::class);
|
Type::addType('uuid', UuidType::class);
|
||||||
|
Type::addType(XMLElement::NAME, XMLElement::class);
|
||||||
|
|
||||||
$config = ORMSetup::createAttributeMetadataConfiguration(
|
$config = ORMSetup::createAttributeMetadataConfiguration(
|
||||||
paths: array(__DIR__."/Models/Data"),
|
paths: array(__DIR__."/Models/Data"),
|
||||||
@ -51,6 +53,8 @@ $container->set(EntityManager::class, static function() {
|
|||||||
'path' => __DIR__ . '/../' . getenv("SQLITE_DB_NAME"),
|
'path' => __DIR__ . '/../' . getenv("SQLITE_DB_NAME"),
|
||||||
], $config);
|
], $config);
|
||||||
|
|
||||||
|
$connection->executeQuery("PRAGMA foreign_keys = ON");
|
||||||
|
|
||||||
return new EntityManager($connection, $config);
|
return new EntityManager($connection, $config);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user